diff --git a/Cargo.lock b/Cargo.lock index 2de9007f5..91c83fb13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -489,6 +489,11 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bbqueue" +version = "0.5.1" +source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2443e06ba0d6d4" + [[package]] name = "benchmarks" version = "1.12.0" @@ -701,6 +706,20 @@ dependencies = [ "serde", ] +[[package]] +name = "bumparaw-collections" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ce682bdc86c2e25ef5cd95881d9d6a1902214eddf74cf9ffea88fe1464377e8" +dependencies = [ + "allocator-api2", + "bitpacking", + "bumpalo", + "hashbrown 0.15.1", + "serde", + "serde_json", +] + [[package]] name = "byte-unit" version = "5.1.4" @@ -969,8 +988,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.1" -source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf8921fe4d53ab8f9e8f9b72ce6f91726cfc40fffab1243d27db406b5e2e9cc2" dependencies = [ "aho-corasick", "csv", @@ -1245,19 +1265,6 @@ dependencies = [ "itertools 0.10.5", ] -[[package]] -name = "crossbeam" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.13" @@ -1917,6 +1924,15 @@ dependencies = [ "serde_json", ] +[[package]] +name = "flume" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +dependencies = [ + "spin", +] + [[package]] name = "fnv" version = "1.0.7" @@ -2263,8 +2279,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grenad" -version = "0.4.7" -source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#58ac87d852413571102f44c5e55ca13509a3f1a0" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e2ac9baf835ee2a7f0622a5617792ced6f65af25994078c343d429431ef2bbc" dependencies = [ "bytemuck", "byteorder", @@ -2614,7 +2631,9 @@ dependencies = [ "big_s", "bincode", "bumpalo", - "crossbeam", + "bumparaw-collections", + "convert_case 0.6.0", + "crossbeam-channel", "csv", "derive_builder 0.20.0", "dump", @@ -2628,7 +2647,6 @@ dependencies = [ "meilisearch-types", "memmap2", "page_size", - "raw-collections", "rayon", "roaring", "serde", @@ -2644,12 +2662,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown 0.15.1", "serde", ] @@ -2708,7 +2726,8 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "irg-kvariants" version = "0.1.1" -source = 
"git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26" dependencies = [ "csv", "once_cell", @@ -3545,6 +3564,7 @@ dependencies = [ "actix-web", "anyhow", "bumpalo", + "bumparaw-collections", "convert_case 0.6.0", "csv", "deserr", @@ -3557,8 +3577,8 @@ dependencies = [ "meili-snap", "memmap2", "milli", - "raw-collections", "roaring", + "rustc-hash 2.1.0", "serde", "serde-cs", "serde_json", @@ -3579,9 +3599,12 @@ dependencies = [ "clap", "dump", "file-store", + "indexmap", "meilisearch-auth", "meilisearch-types", "serde", + "serde_json", + "tempfile", "time", "uuid", ] @@ -3608,11 +3631,13 @@ version = "1.12.0" dependencies = [ "allocator-api2", "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bbqueue", "big_s", "bimap", "bincode", "bstr", "bumpalo", + "bumparaw-collections", "bytemuck", "byteorder", "candle-core", @@ -3627,6 +3652,7 @@ dependencies = [ "enum-iterator", "filter-parser", "flatten-serde-json", + "flume", "fst", "fxhash", "geoutils", @@ -3650,13 +3676,12 @@ dependencies = [ "once_cell", "ordered-float", "rand", - "raw-collections", "rayon", "rayon-par-bridge", "rhai", "roaring", "rstar", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "serde", "serde_json", "slice-group-by", @@ -3912,7 +3937,8 @@ dependencies = [ [[package]] name = "obkv" version = "0.3.0" -source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#ce535874008ecac554f02e0c670e6caf62134d6b" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae4512a8f418ac322335255a72361b9ac927e106f4d7fe6ab4d8ac59cb01f7a9" [[package]] name = "once_cell" @@ -4404,7 +4430,7 @@ dependencies = [ "bytes", "rand", "ring", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "rustls", "slab", "thiserror", @@ -4480,19 +4506,6 @@ dependencies = [ "rand", ] -[[package]] -name = "raw-collections" -version = "0.1.0" -source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a" -dependencies = [ - "allocator-api2", - "bitpacking", - "bumpalo", - "hashbrown 0.15.1", - "serde", - "serde_json", -] - [[package]] name = "raw-cpuid" version = "10.7.0" @@ -4739,8 +4752,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.6" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#8ff028e484fb6192a0acf5a669eaf18c30cada6e" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81dc953b2244ddd5e7860cb0bb2a790494b898ef321d4aff8e260efab60cc88" dependencies = [ "bytemuck", "byteorder", @@ -4789,9 +4803,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "rustc_version" @@ -4960,9 +4974,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "indexmap", 
"itoa", @@ -5182,6 +5196,9 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] [[package]] name = "spm_precompiled" @@ -6015,9 +6032,9 @@ dependencies = [ [[package]] name = "wana_kana" -version = "3.0.0" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "477976a5c56fb7b014795df5a2ce08d2de8bcd4d5980844c5bd3978a7fd1c30b" +checksum = "a74666202acfcb4f9b995be2e3e9f7f530deb65e05a1407b8d0b30c9c451238a" dependencies = [ "fnv", "itertools 0.10.5", diff --git a/Cargo.toml b/Cargo.toml index 5e53dbfa5..89a17d8fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,3 @@ opt-level = 3 opt-level = 3 [profile.dev.package.roaring] opt-level = 3 - -[patch.crates-io] -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" } diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index eec30ea3f..ccd256546 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -24,7 +24,7 @@ tempfile = "3.14.0" criterion = { version = "0.5.1", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = "0.10.6" +roaring = "0.10.7" [build-dependencies] anyhow = "1.0.86" diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 2f33c3454..4acd7b22a 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -8,6 +8,7 @@ use bumpalo::Bump; use criterion::{criterion_group, criterion_main, Criterion}; use milli::documents::PrimaryKey; use milli::heed::{EnvOpenOptions, RwTxn}; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; @@ -16,6 +17,7 @@ use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; use roaring::RoaringBitmap; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; @@ -150,13 +152,14 @@ fn indexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -164,7 +167,7 @@ fn indexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -216,13 +219,14 @@ fn reindexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -230,7 +234,7 @@ fn reindexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -260,13 +264,14 @@ fn reindexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -274,7 +279,7 @@ fn reindexing_songs_default(c: &mut Criterion) { &document_changes, 
EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -328,13 +333,14 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -342,7 +348,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -404,13 +410,14 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -418,7 +425,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -448,13 +455,14 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -462,7 +470,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -488,13 +496,14 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -502,7 +511,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -555,13 +564,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -569,7 +579,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -621,13 +631,14 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -635,7 +646,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -687,13 +698,14 @@ fn indexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), 
config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -701,7 +713,7 @@ fn indexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -752,13 +764,14 @@ fn reindexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -766,7 +779,7 @@ fn reindexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -796,13 +809,14 @@ fn reindexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -810,7 +824,7 @@ fn reindexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -863,13 +877,14 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -877,7 +892,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -939,13 +954,14 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -953,7 +969,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -984,13 +1000,14 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -998,7 +1015,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1025,13 +1042,14 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1039,7 +1057,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1091,13 +1109,14 @@ fn indexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + 
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1105,7 +1124,7 @@ fn indexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1156,13 +1175,14 @@ fn reindexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1170,7 +1190,7 @@ fn reindexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1200,13 +1220,14 @@ fn reindexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1214,7 +1235,7 @@ fn reindexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1267,13 +1288,14 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1281,7 +1303,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1321,6 +1343,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -124,7 +126,7 @@ pub fn base_setup(conf: &Conf) -> Index { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/dump/Cargo.toml b/crates/dump/Cargo.toml index f9d2a9a0b..679a97b4e 100644 --- a/crates/dump/Cargo.toml +++ b/crates/dump/Cargo.toml @@ -17,7 +17,7 @@ http = "1.1.0" meilisearch-types = { path = "../meilisearch-types" } once_cell = "1.19.0" regex = "1.10.5" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } tar = "0.4.41" diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 8bed7f0d4..31cd3028e 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -292,6 +292,8 @@ pub(crate) mod test { embedders: Setting::NotSet, search_cutoff_ms: Setting::NotSet, localized_attributes: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: std::marker::PhantomData, }; settings.check() diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index 785542cce..6b2655bdf 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ 
b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -382,6 +382,8 @@ impl From> for v6::Settings { embedders: v6::Setting::NotSet, localized_attributes: v6::Setting::NotSet, search_cutoff_ms: v6::Setting::NotSet, + facet_search: v6::Setting::NotSet, + prefix_search: v6::Setting::NotSet, _kind: std::marker::PhantomData, } } diff --git a/crates/file-store/src/lib.rs b/crates/file-store/src/lib.rs index c8b3849ab..39ed9482b 100644 --- a/crates/file-store/src/lib.rs +++ b/crates/file-store/src/lib.rs @@ -136,6 +136,14 @@ pub struct File { } impl File { + pub fn from_parts(path: PathBuf, file: Option) -> Self { + Self { path, file } + } + + pub fn into_parts(self) -> (PathBuf, Option) { + (self.path, self.file) + } + pub fn dry_file() -> Result { Ok(Self { path: PathBuf::new(), file: None }) } diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs index f335938b9..08711e5e3 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -10,6 +10,7 @@ use either::Either; use fuzzers::Operation; use milli::documents::mmap_from_objects; use milli::heed::EnvOpenOptions; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig}; use milli::vector::EmbeddingConfigs; @@ -128,13 +129,14 @@ fn main() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -142,7 +144,7 @@ fn main() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 657dd6dfe..ec2f17f84 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -13,6 +13,9 @@ license.workspace = true [dependencies] anyhow = "1.0.86" bincode = "1.3.3" +bumpalo = "3.16.0" +bumparaw-collections = "0.1.2" +convert_case = "0.6.0" csv = "1.3.0" derive_builder = "0.20.0" dump = { path = "../dump" } @@ -21,16 +24,15 @@ file-store = { path = "../file-store" } flate2 = "1.0.30" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } +memmap2 = "0.9.4" page_size = "0.6.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } rayon = "1.10.0" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } synchronoise = "1.0.1" tempfile = "3.10.1" thiserror = "1.0.61" -memmap2 = "0.9.4" time = { version = "0.3.36", features = [ "serde-well-known", "formatting", @@ -40,12 +42,11 @@ time = { version = "0.3.36", features = [ tracing = "0.1.40" ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } -bumpalo = "3.16.0" [dev-dependencies] arroy = "0.5.0" big_s = "1.0.2" -crossbeam = "0.8.4" +crossbeam-channel = "0.5.13" insta = { version = "1.39.0", features = ["json", "redactions"] } maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/crates/index-scheduler/src/autobatcher.rs b/crates/index-scheduler/src/autobatcher.rs index 0f6aa8a3a..5950e2b13 100644 --- a/crates/index-scheduler/src/autobatcher.rs +++ b/crates/index-scheduler/src/autobatcher.rs @@ -115,13 +115,6 @@ 
pub enum BatchKind { allow_index_creation: bool, settings_ids: Vec, }, - SettingsAndDocumentOperation { - settings_ids: Vec, - method: IndexDocumentsMethod, - allow_index_creation: bool, - primary_key: Option, - operation_ids: Vec, - }, Settings { allow_index_creation: bool, settings_ids: Vec, @@ -146,7 +139,6 @@ impl BatchKind { match self { BatchKind::DocumentOperation { allow_index_creation, .. } | BatchKind::ClearAndSettings { allow_index_creation, .. } - | BatchKind::SettingsAndDocumentOperation { allow_index_creation, .. } | BatchKind::Settings { allow_index_creation, .. } => Some(*allow_index_creation), _ => None, } @@ -154,10 +146,7 @@ impl BatchKind { fn primary_key(&self) -> Option> { match self { - BatchKind::DocumentOperation { primary_key, .. } - | BatchKind::SettingsAndDocumentOperation { primary_key, .. } => { - Some(primary_key.as_deref()) - } + BatchKind::DocumentOperation { primary_key, .. } => Some(primary_key.as_deref()), _ => None, } } @@ -275,8 +264,7 @@ impl BatchKind { Break(BatchKind::IndexDeletion { ids }) } ( - BatchKind::ClearAndSettings { settings_ids: mut ids, allow_index_creation: _, mut other } - | BatchKind::SettingsAndDocumentOperation { operation_ids: mut ids, method: _, allow_index_creation: _, primary_key: _, settings_ids: mut other }, + BatchKind::ClearAndSettings { settings_ids: mut ids, allow_index_creation: _, mut other }, K::IndexDeletion, ) => { ids.push(id); @@ -356,15 +344,9 @@ impl BatchKind { ) => Break(this), ( - BatchKind::DocumentOperation { method, allow_index_creation, primary_key, operation_ids }, + this @ BatchKind::DocumentOperation { .. }, K::Settings { .. }, - ) => Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids: vec![id], - method, - allow_index_creation, - primary_key, - operation_ids, - }), + ) => Break(this), (BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: _ }, K::DocumentClear) => { deletion_ids.push(id); @@ -477,63 +459,7 @@ impl BatchKind { allow_index_creation, }) } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: _, mut operation_ids, allow_index_creation, primary_key: _ }, - K::DocumentClear, - ) => { - operation_ids.push(id); - Continue(BatchKind::ClearAndSettings { - settings_ids, - other: operation_ids, - allow_index_creation, - }) - } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: ReplaceDocuments, mut operation_ids, allow_index_creation, primary_key: _}, - K::DocumentImport { method: ReplaceDocuments, primary_key: pk2, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method: ReplaceDocuments, - allow_index_creation, - primary_key: pk2, - operation_ids, - }) - } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: UpdateDocuments, allow_index_creation, primary_key: _, mut operation_ids }, - K::DocumentImport { method: UpdateDocuments, primary_key: pk2, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method: UpdateDocuments, - allow_index_creation, - primary_key: pk2, - operation_ids, - }) - } - // But we can't batch a settings and a doc op with another doc op - // this MUST be AFTER the two previous branch - ( - this @ BatchKind::SettingsAndDocumentOperation { .. }, - K::DocumentDeletion { .. } | K::DocumentImport { .. }, - ) => Break(this), - ( - BatchKind::SettingsAndDocumentOperation { mut settings_ids, method, allow_index_creation,primary_key, operation_ids }, - K::Settings { .. 
}, - ) => { - settings_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method, - allow_index_creation, - primary_key, - operation_ids, - }) - } ( BatchKind::IndexCreation { .. } | BatchKind::IndexDeletion { .. } @@ -808,30 +734,30 @@ mod tests { } #[test] - fn document_addition_batch_with_settings() { + fn document_addition_doesnt_batch_with_settings() { // simple case - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); // multiple settings and doc addition - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [2, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [2, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); // addition and setting unordered - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1, 3], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None), settings(true)]), 
@"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - // We ensure this kind of batch doesn't batch with forbidden operations - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_del()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_del()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_create()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_create()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_update()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_update()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_swap()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_swap()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + // Doesn't batch with other forbidden operations + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: 
ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_create()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_create()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_update()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_update()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_swap()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_swap()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); } #[test] @@ -859,8 +785,8 @@ mod tests { debug_snapshot!(autobatch_from(true, None, [doc_clr(), settings(true)]), @"Some((DocumentClear { ids: [0] }, false))"); debug_snapshot!(autobatch_from(true, None, [settings(true), doc_clr(), settings(true)]), @"Some((ClearAndSettings { other: [1], allow_index_creation: true, settings_ids: [0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr()]), @"Some((ClearAndSettings { other: [0, 2], allow_index_creation: true, settings_ids: [1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr()]), @"Some((ClearAndSettings { other: [0, 2], allow_index_creation: true, settings_ids: [1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr()]), @"Some((DocumentOperation { method: UpdateDocuments, 
allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); } #[test] @@ -907,50 +833,6 @@ mod tests { debug_snapshot!(autobatch_from(false,None, [doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); debug_snapshot!(autobatch_from(false,None, [settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); debug_snapshot!(autobatch_from(false,None, [settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - - // Then the mixed cases. - // The index already exists, whatever is the right of the tasks it shouldn't change the result. - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - - // When the index doesn't exists yet it's more complicated. 
- // Either the first task we encounter create it, in which case we can create a big batch with everything. - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - // The right of the tasks following isn't really important. - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - // Or, the second case; the first task doesn't create the index and thus we wants to batch it with only tasks that can't create an index. - // that can be a second task that don't have the right to create an index. Or anything that can't create an index like an index deletion, document deletion, document clear, etc. - // All theses tasks are going to throw an error `Index doesn't exist` once the batch is processed. - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - // The third and final case is when the first task doesn't create an index but is directly followed by a task creating an index. In this case we can't batch whit what - // follows because we first need to process the erronous batch. 
- debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(true), idx_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(true), idx_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(true), doc_clr(), idx_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(true), doc_clr(), idx_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); } #[test] @@ -959,13 +841,13 @@ mod tests { debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: 
ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); // batch deletion and addition diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 630471790..a40eac02c 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -22,27 +22,26 @@ use std::ffi::OsStr; use std::fmt; use std::fs::{self, File}; use std::io::BufWriter; -use std::sync::atomic::{self, AtomicU64}; -use std::time::Duration; +use std::sync::atomic::Ordering; use bumpalo::collections::CollectIn; use bumpalo::Bump; use dump::IndexMetadata; use meilisearch_types::batches::BatchId; -use meilisearch_types::error::Code; use meilisearch_types::heed::{RoTxn, RwTxn}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; use meilisearch_types::milli::heed::CompactionOption; +use meilisearch_types::milli::progress::Progress; use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction}; -use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings}; +use meilisearch_types::milli::update::{ + DocumentAdditionResult, IndexDocumentsMethod, Settings as MilliSettings, +}; use meilisearch_types::milli::vector::parsed_vectors::{ ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, }; use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder}; use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; -use meilisearch_types::tasks::{ - Details, IndexSwap, Kind, KindWithContent, Status, Task, TaskProgress, -}; +use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task}; use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; use roaring::RoaringBitmap; use time::macros::format_description; @@ -50,6 +49,13 @@ use time::OffsetDateTime; use uuid::Uuid; use crate::autobatcher::{self, BatchKind}; +use crate::processing::{ + AtomicBatchStep, AtomicDocumentStep, AtomicTaskStep, AtomicUpdateFileStep, CreateIndexProgress, + DeleteIndexProgress, DocumentDeletionProgress, DocumentEditionProgress, + DocumentOperationProgress, DumpCreationProgress, InnerSwappingTwoIndexes, SettingsProgress, + SnapshotCreationProgress, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, + UpdateIndexProgress, VariableNameStep, +}; use crate::utils::{self, swap_index_uid_in_task, ProcessingBatch}; use crate::{Error, IndexScheduler, Result, TaskId}; @@ -104,7 +110,6 @@ pub(crate) enum IndexOperation { index_uid: String, primary_key: Option, method: IndexDocumentsMethod, - documents_counts: Vec, operations: Vec, tasks: Vec, }, @@ -130,19 +135,6 @@ pub(crate) enum IndexOperation { index_uid: String, cleared_tasks: Vec, - // The boolean indicates if it's a 
settings deletion or creation. - settings: Vec<(bool, Settings)>, - settings_tasks: Vec, - }, - SettingsAndDocumentOperation { - index_uid: String, - - primary_key: Option, - method: IndexDocumentsMethod, - documents_counts: Vec, - operations: Vec, - document_import_tasks: Vec, - // The boolean indicates if it's a settings deletion or creation. settings: Vec<(bool, Settings)>, settings_tasks: Vec, @@ -174,12 +166,7 @@ impl Batch { IndexOperation::DocumentEdition { task, .. } => { RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() } - IndexOperation::SettingsAndDocumentOperation { - document_import_tasks: tasks, - settings_tasks: other, - .. - } - | IndexOperation::DocumentClearAndSetting { + IndexOperation::DocumentClearAndSetting { cleared_tasks: tasks, settings_tasks: other, .. @@ -239,8 +226,7 @@ impl IndexOperation { | IndexOperation::DocumentDeletion { index_uid, .. } | IndexOperation::DocumentClear { index_uid, .. } | IndexOperation::Settings { index_uid, .. } - | IndexOperation::DocumentClearAndSetting { index_uid, .. } - | IndexOperation::SettingsAndDocumentOperation { index_uid, .. } => index_uid, + | IndexOperation::DocumentClearAndSetting { index_uid, .. } => index_uid, } } } @@ -262,9 +248,6 @@ impl fmt::Display for IndexOperation { IndexOperation::DocumentClearAndSetting { .. } => { f.write_str("IndexOperation::DocumentClearAndSetting") } - IndexOperation::SettingsAndDocumentOperation { .. } => { - f.write_str("IndexOperation::SettingsAndDocumentOperation") - } } } } @@ -330,21 +313,14 @@ impl IndexScheduler { }) .flatten(); - let mut documents_counts = Vec::new(); let mut operations = Vec::new(); for task in tasks.iter() { match task.kind { - KindWithContent::DocumentAdditionOrUpdate { - content_file, - documents_count, - .. - } => { - documents_counts.push(documents_count); + KindWithContent::DocumentAdditionOrUpdate { content_file, .. } => { operations.push(DocumentOperation::Add(content_file)); } KindWithContent::DocumentDeletion { ref documents_ids, .. } => { - documents_counts.push(documents_ids.len() as u64); operations.push(DocumentOperation::Delete(documents_ids.clone())); } _ => unreachable!(), @@ -356,7 +332,6 @@ impl IndexScheduler { index_uid, primary_key, method, - documents_counts, operations, tasks, }, @@ -441,67 +416,6 @@ impl IndexScheduler { must_create_index, })) } - BatchKind::SettingsAndDocumentOperation { - settings_ids, - method, - allow_index_creation, - primary_key, - operation_ids, - } => { - let settings = self.create_next_batch_index( - rtxn, - index_uid.clone(), - BatchKind::Settings { settings_ids, allow_index_creation }, - current_batch, - must_create_index, - )?; - - let document_import = self.create_next_batch_index( - rtxn, - index_uid.clone(), - BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key, - operation_ids, - }, - current_batch, - must_create_index, - )?; - - match (document_import, settings) { - ( - Some(Batch::IndexOperation { - op: - IndexOperation::DocumentOperation { - primary_key, - documents_counts, - operations, - tasks: document_import_tasks, - .. - }, - .. - }), - Some(Batch::IndexOperation { - op: IndexOperation::Settings { settings, tasks: settings_tasks, .. }, - .. 
- }), - ) => Ok(Some(Batch::IndexOperation { - op: IndexOperation::SettingsAndDocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - document_import_tasks, - settings, - settings_tasks, - }, - must_create_index, - })), - _ => unreachable!(), - } - } BatchKind::IndexCreation { id } => { let mut task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; current_batch.processing(Some(&mut task)); @@ -589,7 +503,6 @@ impl IndexScheduler { // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; let mut task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); // If the task is not associated with any index, verify that it is an index swap and // create the batch directly. Otherwise, get the index name associated with the task @@ -599,6 +512,7 @@ impl IndexScheduler { index_name } else { assert!(matches!(&task.kind, KindWithContent::IndexSwap { swaps } if swaps.is_empty())); + current_batch.processing(Some(&mut task)); return Ok(Some((Batch::IndexSwap { task }, current_batch))); }; @@ -652,11 +566,12 @@ impl IndexScheduler { /// The list of tasks that were processed. The metadata of each task in the returned /// list is updated accordingly, with the exception of the its date fields /// [`finished_at`](meilisearch_types::tasks::Task::finished_at) and [`started_at`](meilisearch_types::tasks::Task::started_at). - #[tracing::instrument(level = "trace", skip(self, batch), target = "indexing::scheduler", fields(batch=batch.to_string()))] + #[tracing::instrument(level = "trace", skip(self, batch, progress), target = "indexing::scheduler", fields(batch=batch.to_string()))] pub(crate) fn process_batch( &self, batch: Batch, current_batch: &mut ProcessingBatch, + progress: Progress, ) -> Result> { #[cfg(test)] { @@ -676,8 +591,13 @@ impl IndexScheduler { }; let rtxn = self.env.read_txn()?; - let mut canceled_tasks = - self.cancel_matched_tasks(&rtxn, task.uid, current_batch, matched_tasks)?; + let mut canceled_tasks = self.cancel_matched_tasks( + &rtxn, + task.uid, + current_batch, + matched_tasks, + &progress, + )?; task.status = Status::Succeeded; match &mut task.details { @@ -708,7 +628,8 @@ impl IndexScheduler { } let mut wtxn = self.env.write_txn()?; - let mut deleted_tasks = self.delete_matched_tasks(&mut wtxn, &matched_tasks)?; + let mut deleted_tasks = + self.delete_matched_tasks(&mut wtxn, &matched_tasks, &progress)?; wtxn.commit()?; for task in tasks.iter_mut() { @@ -734,6 +655,8 @@ impl IndexScheduler { Ok(tasks) } Batch::SnapshotCreation(mut tasks) => { + progress.update_progress(SnapshotCreationProgress::StartTheSnapshotCreation); + fs::create_dir_all(&self.snapshots_path)?; let temp_snapshot_dir = tempfile::tempdir()?; @@ -754,6 +677,7 @@ impl IndexScheduler { // two read operations as the task processing is synchronous. // 2.1 First copy the LMDB env of the index-scheduler + progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexScheduler); let dst = temp_snapshot_dir.path().join("tasks"); fs::create_dir_all(&dst)?; self.env.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; @@ -766,27 +690,41 @@ impl IndexScheduler { fs::create_dir_all(&update_files_dir)?; // 2.4 Only copy the update files of the enqueued tasks - for task_id in self.get_status(&rtxn, Status::Enqueued)? 
{ + progress.update_progress(SnapshotCreationProgress::SnapshotTheUpdateFiles); + let enqueued = self.get_status(&rtxn, Status::Enqueued)?; + let (atomic, update_file_progress) = + AtomicUpdateFileStep::new(enqueued.len() as u32); + progress.update_progress(update_file_progress); + for task_id in enqueued { let task = self.get_task(&rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; if let Some(content_uuid) = task.content_uuid() { let src = self.file_store.get_update_path(content_uuid); let dst = update_files_dir.join(content_uuid.to_string()); fs::copy(src, dst)?; } + atomic.fetch_add(1, Ordering::Relaxed); } // 3. Snapshot every indexes - for result in self.index_mapper.index_mapping.iter(&rtxn)? { + progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexes); + let index_mapping = self.index_mapper.index_mapping; + let nb_indexes = index_mapping.len(&rtxn)? as u32; + + for (i, result) in index_mapping.iter(&rtxn)?.enumerate() { let (name, uuid) = result?; + progress.update_progress(VariableNameStep::new(name, i as u32, nb_indexes)); let index = self.index_mapper.index(&rtxn, name)?; let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string()); fs::create_dir_all(&dst)?; - index.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; + index + .copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled) + .map_err(|e| Error::from_milli(e, Some(name.to_string())))?; } drop(rtxn); // 4. Snapshot the auth LMDB env + progress.update_progress(SnapshotCreationProgress::SnapshotTheApiKeys); let dst = temp_snapshot_dir.path().join("auth"); fs::create_dir_all(&dst)?; // TODO We can't use the open_auth_store_env function here but we should @@ -799,6 +737,7 @@ impl IndexScheduler { auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; // 5. Copy and tarball the flat snapshot + progress.update_progress(SnapshotCreationProgress::CreateTheTarball); // 5.1 Find the original name of the database // TODO find a better way to get this path let mut base_path = self.env.path().to_owned(); @@ -831,6 +770,7 @@ impl IndexScheduler { Ok(tasks) } Batch::Dump(mut task) => { + progress.update_progress(DumpCreationProgress::StartTheDumpCreation); let started_at = OffsetDateTime::now_utc(); let (keys, instance_uid) = if let KindWithContent::DumpCreation { keys, instance_uid } = &task.kind { @@ -841,6 +781,7 @@ impl IndexScheduler { let dump = dump::DumpWriter::new(*instance_uid)?; // 1. dump the keys + progress.update_progress(DumpCreationProgress::DumpTheApiKeys); let mut dump_keys = dump.create_keys()?; for key in keys { dump_keys.push_key(key)?; @@ -850,7 +791,13 @@ impl IndexScheduler { let rtxn = self.env.read_txn()?; // 2. dump the tasks + progress.update_progress(DumpCreationProgress::DumpTheTasks); let mut dump_tasks = dump.create_tasks_queue()?; + + let (atomic, update_task_progress) = + AtomicTaskStep::new(self.all_tasks.len(&rtxn)? as u32); + progress.update_progress(update_task_progress); + for ret in self.all_tasks.iter(&rtxn)? { if self.must_stop_processing.get() { return Err(Error::AbortedTask); @@ -883,50 +830,84 @@ impl IndexScheduler { let content_file = self.file_store.get_update(content_file)?; let reader = DocumentsBatchReader::from_reader(content_file) - .map_err(milli::Error::from)?; + .map_err(|e| Error::from_milli(e.into(), None))?; let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); - while let Some(doc) = - cursor.next_document().map_err(milli::Error::from)? 
+ while let Some(doc) = cursor + .next_document() + .map_err(|e| Error::from_milli(e.into(), None))? { - dump_content_file - .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; + dump_content_file.push_document( + &obkv_to_object(doc, &documents_batch_index) + .map_err(|e| Error::from_milli(e, None))?, + )?; } dump_content_file.flush()?; } } + atomic.fetch_add(1, Ordering::Relaxed); } dump_tasks.flush()?; // 3. Dump the indexes + progress.update_progress(DumpCreationProgress::DumpTheIndexes); + let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32; + let mut count = 0; self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> { + progress.update_progress(VariableNameStep::new( + uid.to_string(), + count, + nb_indexes, + )); + count += 1; + let rtxn = index.read_txn()?; let metadata = IndexMetadata { uid: uid.to_owned(), primary_key: index.primary_key(&rtxn)?.map(String::from), - created_at: index.created_at(&rtxn)?, - updated_at: index.updated_at(&rtxn)?, + created_at: index + .created_at(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, + updated_at: index + .updated_at(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, }; let mut index_dumper = dump.create_index(uid, &metadata)?; let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(&rtxn)?; + let embedding_configs = index + .embedding_configs(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let nb_documents = index + .number_of_documents(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? + as u32; + let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents); + progress.update_progress(update_document_progress); + let documents = index + .all_documents(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // 3.1. Dump the documents - for ret in index.all_documents(&rtxn)? { + for ret in documents { if self.must_stop_processing.get() { return Err(Error::AbortedTask); } - let (id, doc) = ret?; + let (id, doc) = + ret.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; + let mut document = + milli::obkv_to_json(&all_fields, &fields_ids_map, doc) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; 'inject_vectors: { - let embeddings = index.embeddings(&rtxn, id)?; + let embeddings = index + .embeddings(&rtxn, id) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; if embeddings.is_empty() { break 'inject_vectors; @@ -937,7 +918,7 @@ impl IndexScheduler { .or_insert(serde_json::Value::Object(Default::default())); let serde_json::Value::Object(vectors) = vectors else { - return Err(milli::Error::UserError( + let user_err = milli::Error::UserError( milli::UserError::InvalidVectorsMapType { document_id: { if let Ok(Some(Ok(index))) = index @@ -951,8 +932,9 @@ impl IndexScheduler { }, value: vectors.clone(), }, - ) - .into()); + ); + + return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; for (embedder_name, embeddings) in embeddings { @@ -975,6 +957,7 @@ impl IndexScheduler { } index_dumper.push_document(&document)?; + atomic.fetch_add(1, Ordering::Relaxed); } // 3.2. 
Dump the settings @@ -982,12 +965,14 @@ impl IndexScheduler { index, &rtxn, meilisearch_types::settings::SecretPolicy::RevealSecrets, - )?; + ) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; index_dumper.settings(&settings)?; Ok(()) })?; // 4. Dump experimental feature settings + progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures); let features = self.features().runtime_features(); dump.create_experimental_features(features)?; @@ -998,6 +983,7 @@ impl IndexScheduler { if self.must_stop_processing.get() { return Err(Error::AbortedTask); } + progress.update_progress(DumpCreationProgress::CompressTheDump); let path = self.dumps_path.join(format!("{}.dump", dump_uid)); let file = File::create(path)?; dump.persist_to(BufWriter::new(file))?; @@ -1023,8 +1009,14 @@ impl IndexScheduler { .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); let mut index_wtxn = index.write_txn()?; - let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?; - index_wtxn.commit()?; + let tasks = self.apply_index_operation(&mut index_wtxn, &index, op, progress)?; + + { + let span = tracing::trace_span!(target: "indexing::scheduler", "commit"); + let _entered = span.enter(); + + index_wtxn.commit()?; + } // if the update processed successfully, we're going to store the new // stats of the index. Since the tasks have already been processed and @@ -1032,7 +1024,8 @@ impl IndexScheduler { // the entire batch. let res = || -> Result<()> { let index_rtxn = index.read_txn()?; - let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?; + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; let mut wtxn = self.env.write_txn()?; self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; wtxn.commit()?; @@ -1050,6 +1043,8 @@ impl IndexScheduler { Ok(tasks) } Batch::IndexCreation { index_uid, primary_key, task } => { + progress.update_progress(CreateIndexProgress::CreatingTheIndex); + let wtxn = self.env.write_txn()?; if self.index_mapper.exists(&wtxn, &index_uid)? 
{ return Err(Error::IndexAlreadyExists(index_uid)); @@ -1059,9 +1054,11 @@ impl IndexScheduler { self.process_batch( Batch::IndexUpdate { index_uid, primary_key, task }, current_batch, + progress, ) } Batch::IndexUpdate { index_uid, primary_key, mut task } => { + progress.update_progress(UpdateIndexProgress::UpdatingTheIndex); let rtxn = self.env.read_txn()?; let index = self.index_mapper.index(&rtxn, &index_uid)?; @@ -1074,10 +1071,12 @@ impl IndexScheduler { ); builder.set_primary_key(primary_key); let must_stop_processing = self.must_stop_processing.clone(); - builder.execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; + builder + .execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + || must_stop_processing.get(), + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; index_wtxn.commit()?; } @@ -1094,7 +1093,8 @@ impl IndexScheduler { let res = || -> Result<()> { let mut wtxn = self.env.write_txn()?; let index_rtxn = index.read_txn()?; - let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?; + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; wtxn.commit()?; Ok(()) @@ -1111,13 +1111,16 @@ impl IndexScheduler { Ok(vec![task]) } Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => { + progress.update_progress(DeleteIndexProgress::DeletingTheIndex); let wtxn = self.env.write_txn()?; // it's possible that the index doesn't exist let number_of_documents = || -> Result { let index = self.index_mapper.index(&wtxn, &index_uid)?; let index_rtxn = index.read_txn()?; - Ok(index.number_of_documents(&index_rtxn)?) + index + .number_of_documents(&index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string()))) }() .unwrap_or_default(); @@ -1142,6 +1145,8 @@ impl IndexScheduler { Ok(tasks) } Batch::IndexSwap { mut task } => { + progress.update_progress(SwappingTheIndexes::EnsuringCorrectnessOfTheSwap); + let mut wtxn = self.env.write_txn()?; let swaps = if let KindWithContent::IndexSwap { swaps } = &task.kind { swaps @@ -1168,8 +1173,20 @@ impl IndexScheduler { )); } } - for swap in swaps { - self.apply_index_swap(&mut wtxn, task.uid, &swap.indexes.0, &swap.indexes.1)?; + progress.update_progress(SwappingTheIndexes::SwappingTheIndexes); + for (step, swap) in swaps.iter().enumerate() { + progress.update_progress(VariableNameStep::new( + format!("swapping index {} and {}", swap.indexes.0, swap.indexes.1), + step as u32, + swaps.len() as u32, + )); + self.apply_index_swap( + &mut wtxn, + &progress, + task.uid, + &swap.indexes.0, + &swap.indexes.1, + )?; } wtxn.commit()?; task.status = Status::Succeeded; @@ -1179,7 +1196,15 @@ impl IndexScheduler { } /// Swap the index `lhs` with the index `rhs`. - fn apply_index_swap(&self, wtxn: &mut RwTxn, task_id: u32, lhs: &str, rhs: &str) -> Result<()> { + fn apply_index_swap( + &self, + wtxn: &mut RwTxn, + progress: &Progress, + task_id: u32, + lhs: &str, + rhs: &str, + ) -> Result<()> { + progress.update_progress(InnerSwappingTwoIndexes::RetrieveTheTasks); // 1. Verify that both lhs and rhs are existing indexes let index_lhs_exists = self.index_mapper.index_exists(wtxn, lhs)?; if !index_lhs_exists { @@ -1197,14 +1222,21 @@ impl IndexScheduler { index_rhs_task_ids.remove_range(task_id..); // 3. 
before_name -> new_name in the task's KindWithContent - for task_id in &index_lhs_task_ids | &index_rhs_task_ids { + progress.update_progress(InnerSwappingTwoIndexes::UpdateTheTasks); + let tasks_to_update = &index_lhs_task_ids | &index_rhs_task_ids; + let (atomic, task_progress) = AtomicTaskStep::new(tasks_to_update.len() as u32); + progress.update_progress(task_progress); + + for task_id in tasks_to_update { let mut task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; swap_index_uid_in_task(&mut task, (lhs, rhs)); self.all_tasks.put(wtxn, &task_id, &task)?; + atomic.fetch_add(1, Ordering::Relaxed); } // 4. remove the task from indexuid = before_name // 5. add the task to indexuid = after_name + progress.update_progress(InnerSwappingTwoIndexes::UpdateTheIndexesMetadata); self.update_index(wtxn, lhs, |lhs_tasks| { *lhs_tasks -= &index_lhs_task_ids; *lhs_tasks |= &index_rhs_task_ids; @@ -1226,7 +1258,7 @@ impl IndexScheduler { /// The list of processed tasks. #[tracing::instrument( level = "trace", - skip(self, index_wtxn, index), + skip(self, index_wtxn, index, progress), target = "indexing::scheduler" )] fn apply_index_operation<'i>( @@ -1234,48 +1266,18 @@ impl IndexScheduler { index_wtxn: &mut RwTxn<'i>, index: &'i Index, operation: IndexOperation, + progress: Progress, ) -> Result> { let indexer_alloc = Bump::new(); let started_processing_at = std::time::Instant::now(); - let secs_since_started_processing_at = AtomicU64::new(0); - const PRINT_SECS_DELTA: u64 = 5; - - let processing_tasks = self.processing_tasks.clone(); let must_stop_processing = self.must_stop_processing.clone(); - let send_progress = |progress| { - let now = std::time::Instant::now(); - let elapsed = secs_since_started_processing_at.load(atomic::Ordering::Relaxed); - let previous = started_processing_at + Duration::from_secs(elapsed); - let elapsed = now - previous; - - if elapsed.as_secs() < PRINT_SECS_DELTA { - return; - } - - secs_since_started_processing_at - .store((now - started_processing_at).as_secs(), atomic::Ordering::Relaxed); - - let TaskProgress { - current_step, - finished_steps, - total_steps, - finished_substeps, - total_substeps, - } = processing_tasks.write().unwrap().update_progress(progress); - - tracing::info!( - current_step, - finished_steps, - total_steps, - finished_substeps, - total_substeps - ); - }; match operation { - IndexOperation::DocumentClear { mut tasks, .. } => { - let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?; + IndexOperation::DocumentClear { index_uid, mut tasks } => { + let count = milli::update::ClearDocuments::new(index_wtxn, index) + .execute() + .map_err(|e| Error::from_milli(e, Some(index_uid)))?; let mut first_clear_found = false; for task in &mut tasks { @@ -1295,13 +1297,13 @@ impl IndexScheduler { Ok(tasks) } IndexOperation::DocumentOperation { - index_uid: _, + index_uid, primary_key, method, - documents_counts: _, operations, mut tasks, } => { + progress.update_progress(DocumentOperationProgress::RetrievingConfig); // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. // this is made difficult by the fact we're doing private clones of the index scheduler and sending it // to a fresh thread. 
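A note on the pattern in the hunks above: the old throttled `send_progress` closure is removed, and `apply_index_swap` / `apply_index_operation` now receive a `Progress` handle and publish named steps into it. Below is a minimal sketch of that reporting style, assuming the `AtomicTaskStep` alias and the step-enum macro this patch adds later in `processing.rs`; the enum name, function and module paths here are illustrative only, not part of the patch.

use std::sync::atomic::Ordering;

use enum_iterator::Sequence; // needed by the derive inside the macro, as in processing.rs
use meilisearch_types::milli::make_enum_progress;
use meilisearch_types::milli::progress::Progress;

use crate::processing::AtomicTaskStep; // module introduced further down in this patch

// Hypothetical step enum, in the style of TaskDeletionProgress above.
make_enum_progress! {
    pub enum ExampleProgress {
        RetrievingTasks,
        UpdatingTasks,
    }
}

fn report(progress: &Progress, task_ids: &[u32]) {
    // Publish the coarse step first...
    progress.update_progress(ExampleProgress::RetrievingTasks);

    // ...then layer a per-item counter on top of it, bumped once per task,
    // exactly like the `atomic.fetch_add(1, Ordering::Relaxed)` calls above.
    let (atomic, step) = AtomicTaskStep::new(task_ids.len() as u32);
    progress.update_progress(step);
    for _id in task_ids {
        // ... handle the task ...
        atomic.fetch_add(1, Ordering::Relaxed);
    }

    progress.update_progress(ExampleProgress::UpdatingTasks);
}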
@@ -1322,13 +1324,17 @@ impl IndexScheduler { let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(method); - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; for operation in operations { match operation { DocumentOperation::Add(_content_uuid) => { let mmap = content_files_iter.next().unwrap(); - indexer.add_documents(mmap)?; + indexer + .add_documents(mmap) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; } DocumentOperation::Delete(document_ids) => { let document_ids: bumpalo::collections::vec::Vec<_> = document_ids @@ -1345,24 +1351,30 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; - let (document_changes, operation_stats, primary_key) = indexer.into_changes( - &indexer_alloc, - index, - &rtxn, - primary_key.as_deref(), - &mut new_fields_ids_map, - &|| must_stop_processing.get(), - &send_progress, - )?; + progress.update_progress(DocumentOperationProgress::ComputingDocumentChanges); + let (document_changes, operation_stats, primary_key) = indexer + .into_changes( + &indexer_alloc, + index, + &rtxn, + primary_key.as_deref(), + &mut new_fields_ids_map, + &|| must_stop_processing.get(), + progress.clone(), + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; - let mut addition = 0; + let mut candidates_count = 0; for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { - addition += stats.document_count; + candidates_count += stats.document_count; match stats.error { Some(error) => { task.status = Status::Failed; @@ -1392,29 +1404,38 @@ impl IndexScheduler { } } + progress.update_progress(DocumentOperationProgress::Indexing); if tasks.iter().any(|res| res.error.is_none()) { - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - primary_key, - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + embedders, + &|| must_stop_processing.get(), + &progress, + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + }; tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } Ok(tasks) } - IndexOperation::DocumentEdition { mut task, .. 
} => { + IndexOperation::DocumentEdition { index_uid, mut task } => { + progress.update_progress(DocumentEditionProgress::RetrievingConfig); + let (filter, code) = if let KindWithContent::DocumentEdition { filter_expr, context: _, @@ -1428,16 +1449,11 @@ impl IndexScheduler { }; let candidates = match filter.as_ref().map(Filter::from_json) { - Some(Ok(Some(filter))) => { - filter.evaluate(index_wtxn, index).map_err(|err| match err { - milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { - Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter) - } - e => e.into(), - })? - } + Some(Ok(Some(filter))) => filter + .evaluate(index_wtxn, index) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, None | Some(Ok(None)) => index.documents_ids(index_wtxn)?, - Some(Err(e)) => return Err(e.into()), + Some(Err(e)) => return Err(Error::from_milli(e, Some(index_uid.clone()))), }; let (original_filter, context, function) = if let Some(Details::DocumentEdition { @@ -1472,8 +1488,9 @@ impl IndexScheduler { // candidates not empty => index not empty => a primary key is set let primary_key = index.primary_key(&rtxn)?.unwrap(); - let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) - .map_err(milli::Error::from)?; + let primary_key = + PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let result_count = Ok((candidates.len(), candidates.len())) as Result<_>; @@ -1483,36 +1500,53 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; - pool.install(|| { - let indexer = - UpdateByFunction::new(candidates, context.clone(), code.clone()); - let document_changes = indexer.into_changes(&primary_key)?; - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let candidates_count = candidates.len(); + progress.update_progress(DocumentEditionProgress::ComputingDocumentChanges); + let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); + let document_changes = pool + .install(|| { + indexer + .into_changes(&primary_key) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone()))) + }) + .unwrap()?; + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - None, // cannot change primary key in DocumentEdition - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - )?; + progress.update_progress(DocumentEditionProgress::Indexing); + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // cannot change primary key in DocumentEdition + &document_changes, + embedders, + &|| must_stop_processing.get(), + &progress, + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; - Result::Ok(()) - }) - .unwrap()?; + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| 
Error::from_milli(err, Some(index_uid.clone())))?, + }; - // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } match result_count { @@ -1541,7 +1575,9 @@ impl IndexScheduler { Ok(vec![task]) } - IndexOperation::DocumentDeletion { mut tasks, index_uid: _ } => { + IndexOperation::DocumentDeletion { mut tasks, index_uid } => { + progress.update_progress(DocumentDeletionProgress::RetrievingConfig); + let mut to_delete = RoaringBitmap::new(); let external_documents_ids = index.external_documents_ids(); @@ -1562,35 +1598,23 @@ impl IndexScheduler { deleted_documents: Some(will_be_removed), }); } - KindWithContent::DocumentDeletionByFilter { index_uid: _, filter_expr } => { + KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } => { let before = to_delete.len(); let filter = match Filter::from_json(filter_expr) { Ok(filter) => filter, Err(err) => { // theorically, this should be catched by deserr before reaching the index-scheduler and cannot happens task.status = Status::Failed; - task.error = match err { - milli::Error::UserError( - milli::UserError::InvalidFilterExpression { .. }, - ) => Some( - Error::from(err) - .with_custom_error_code(Code::InvalidDocumentFilter) - .into(), - ), - e => Some(e.into()), - }; + task.error = Some( + Error::from_milli(err, Some(index_uid.clone())).into(), + ); None } }; if let Some(filter) = filter { - let candidates = - filter.evaluate(index_wtxn, index).map_err(|err| match err { - milli::Error::UserError( - milli::UserError::InvalidFilter(_), - ) => Error::from(err) - .with_custom_error_code(Code::InvalidDocumentFilter), - e => e.into(), - }); + let candidates = filter + .evaluate(index_wtxn, index) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone()))); match candidates { Ok(candidates) => to_delete |= candidates, Err(err) => { @@ -1626,8 +1650,9 @@ impl IndexScheduler { // to_delete not empty => index not empty => primary key set let primary_key = index.primary_key(&rtxn)?.unwrap(); - let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) - .map_err(milli::Error::from)?; + let primary_key = + PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; if !tasks.iter().all(|res| res.error.is_some()) { let local_pool; @@ -1635,39 +1660,54 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; + progress.update_progress(DocumentDeletionProgress::DeleteDocuments); let mut indexer = indexer::DocumentDeletion::new(); + let candidates_count = to_delete.len(); indexer.delete_documents_by_docids(to_delete); let document_changes = indexer.into_changes(&indexer_alloc, primary_key); - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - 
new_fields_ids_map, - None, // document deletion never changes primary key - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; + progress.update_progress(DocumentDeletionProgress::Indexing); + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // document deletion never changes primary key + &document_changes, + embedders, + &|| must_stop_processing.get(), + &progress, + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; - // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + }; + + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } Ok(tasks) } - IndexOperation::Settings { index_uid: _, settings, mut tasks } => { + IndexOperation::Settings { index_uid, settings, mut tasks } => { + progress.update_progress(SettingsProgress::RetrievingAndMergingTheSettings); let indexer_config = self.index_mapper.indexer_config(); let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config); @@ -1681,50 +1721,16 @@ impl IndexScheduler { task.status = Status::Succeeded; } - builder.execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; + progress.update_progress(SettingsProgress::ApplyTheSettings); + builder + .execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + || must_stop_processing.get(), + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; Ok(tasks) } - IndexOperation::SettingsAndDocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - document_import_tasks, - settings, - settings_tasks, - } => { - let settings_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::Settings { - index_uid: index_uid.clone(), - settings, - tasks: settings_tasks, - }, - )?; - - let mut import_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::DocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - tasks: document_import_tasks, - }, - )?; - - let mut tasks = settings_tasks; - tasks.append(&mut import_tasks); - Ok(tasks) - } IndexOperation::DocumentClearAndSetting { index_uid, cleared_tasks, @@ -1738,12 +1744,14 @@ impl IndexScheduler { index_uid: index_uid.clone(), tasks: cleared_tasks, }, + progress.clone(), )?; let settings_tasks = self.apply_index_operation( index_wtxn, index, IndexOperation::Settings { index_uid, settings, tasks: settings_tasks }, + progress, )?; let mut tasks = settings_tasks; @@ -1760,15 +1768,18 @@ impl IndexScheduler { &self, wtxn: &mut RwTxn, matched_tasks: &RoaringBitmap, + progress: &Progress, ) -> Result { + progress.update_progress(TaskDeletionProgress::DeletingTasksDateTime); + // 1. 
Remove from this list the tasks that we are not allowed to delete let enqueued_tasks = self.get_status(wtxn, Status::Enqueued)?; let processing_tasks = &self.processing_tasks.read().unwrap().processing.clone(); let all_task_ids = self.all_task_ids(wtxn)?; let mut to_delete_tasks = all_task_ids & matched_tasks; - to_delete_tasks -= processing_tasks; - to_delete_tasks -= enqueued_tasks; + to_delete_tasks -= &**processing_tasks; + to_delete_tasks -= &enqueued_tasks; // 2. We now have a list of tasks to delete, delete them @@ -1779,6 +1790,8 @@ impl IndexScheduler { // The tasks that have been removed *per batches*. let mut affected_batches: HashMap = HashMap::new(); + let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32); + progress.update_progress(task_progress); for task_id in to_delete_tasks.iter() { let task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; @@ -1802,22 +1815,35 @@ impl IndexScheduler { if let Some(batch_uid) = task.batch_uid { affected_batches.entry(batch_uid).or_default().insert(task_id); } + atomic_progress.fetch_add(1, Ordering::Relaxed); } + progress.update_progress(TaskDeletionProgress::DeletingTasksMetadata); + let (atomic_progress, task_progress) = AtomicTaskStep::new( + (affected_indexes.len() + affected_statuses.len() + affected_kinds.len()) as u32, + ); + progress.update_progress(task_progress); for index in affected_indexes.iter() { self.update_index(wtxn, index, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } for status in affected_statuses.iter() { self.update_status(wtxn, *status, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } for kind in affected_kinds.iter() { self.update_kind(wtxn, *kind, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } + progress.update_progress(TaskDeletionProgress::DeletingTasks); + let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32); + progress.update_progress(task_progress); for task in to_delete_tasks.iter() { self.all_tasks.delete(wtxn, &task)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); } for canceled_by in affected_canceled_by { if let Some(mut tasks) = self.canceled_by.get(wtxn, &canceled_by)? { @@ -1829,6 +1855,9 @@ impl IndexScheduler { } } } + progress.update_progress(TaskDeletionProgress::DeletingBatches); + let (atomic_progress, batch_progress) = AtomicBatchStep::new(affected_batches.len() as u32); + progress.update_progress(batch_progress); for (batch_id, to_delete_tasks) in affected_batches { if let Some(mut tasks) = self.batch_to_tasks_mapping.get(wtxn, &batch_id)? { tasks -= &to_delete_tasks; @@ -1870,6 +1899,7 @@ impl IndexScheduler { } } } + atomic_progress.fetch_add(1, Ordering::Relaxed); } Ok(to_delete_tasks) @@ -1884,21 +1914,36 @@ impl IndexScheduler { cancel_task_id: TaskId, current_batch: &mut ProcessingBatch, matched_tasks: &RoaringBitmap, + progress: &Progress, ) -> Result> { + progress.update_progress(TaskCancelationProgress::RetrievingTasks); + // 1. Remove from this list the tasks that we are not allowed to cancel // Notice that only the _enqueued_ ones are cancelable and we should // have already aborted the indexation of the _processing_ ones let cancelable_tasks = self.get_status(rtxn, Status::Enqueued)?; let tasks_to_cancel = cancelable_tasks & matched_tasks; - // 2. 
We now have a list of tasks to cancel, cancel them - let mut tasks = self.get_existing_tasks(rtxn, tasks_to_cancel.iter())?; + let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32); + progress.update_progress(progress_obj); + // 2. We now have a list of tasks to cancel, cancel them + let mut tasks = self.get_existing_tasks( + rtxn, + tasks_to_cancel.iter().inspect(|_| { + task_progress.fetch_add(1, Ordering::Relaxed); + }), + )?; + + progress.update_progress(TaskCancelationProgress::UpdatingTasks); + let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32); + progress.update_progress(progress_obj); for task in tasks.iter_mut() { task.status = Status::Canceled; task.canceled_by = Some(cancel_task_id); task.details = task.details.as_ref().map(|d| d.to_failed()); current_batch.processing(Some(task)); + task_progress.fetch_add(1, Ordering::Relaxed); } Ok(tasks) diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index f6a4ecc04..f6ee1f685 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -104,7 +104,7 @@ pub enum Error { )] InvalidTaskCanceledBy { canceled_by: String }, #[error( - "{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes." + "{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 400 bytes." )] InvalidIndexUid { index_uid: String }, #[error("Task `{0}` not found.")] @@ -122,8 +122,11 @@ pub enum Error { Dump(#[from] dump::Error), #[error(transparent)] Heed(#[from] heed::Error), - #[error(transparent)] - Milli(#[from] milli::Error), + #[error("{}", match .index_uid { + Some(uid) if !uid.is_empty() => format!("Index `{}`: {error}", uid), + _ => format!("{error}") + })] + Milli { error: milli::Error, index_uid: Option }, #[error("An unexpected crash occurred when processing the task.")] ProcessBatchPanicked, #[error(transparent)] @@ -190,7 +193,7 @@ impl Error { | Error::AbortedTask | Error::Dump(_) | Error::Heed(_) - | Error::Milli(_) + | Error::Milli { .. } | Error::ProcessBatchPanicked | Error::FileStore(_) | Error::IoError(_) @@ -209,6 +212,20 @@ impl Error { pub fn with_custom_error_code(self, code: Code) -> Self { Self::WithCustomErrorCode(code, Box::new(self)) } + + pub fn from_milli(err: milli::Error, index_uid: Option) -> Self { + match err { + milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { + Self::Milli { error: err, index_uid } + .with_custom_error_code(Code::InvalidDocumentFilter) + } + milli::Error::UserError(milli::UserError::InvalidFilterExpression { .. }) => { + Self::Milli { error: err, index_uid } + .with_custom_error_code(Code::InvalidDocumentFilter) + } + _ => Self::Milli { error: err, index_uid }, + } + } } impl ErrorCode for Error { @@ -236,7 +253,7 @@ impl ErrorCode for Error { // TODO: not sure of the Code to use Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice, Error::Dump(e) => e.error_code(), - Error::Milli(e) => e.error_code(), + Error::Milli { error, .. 
} => error.error_code(), Error::ProcessBatchPanicked => Code::Internal, Error::Heed(e) => e.error_code(), Error::HeedTransaction(e) => e.error_code(), diff --git a/crates/index-scheduler/src/index_mapper/index_map.rs b/crates/index-scheduler/src/index_mapper/index_map.rs index f8080d23b..480dafa7c 100644 --- a/crates/index-scheduler/src/index_mapper/index_map.rs +++ b/crates/index-scheduler/src/index_mapper/index_map.rs @@ -3,14 +3,13 @@ use std::path::Path; use std::time::Duration; use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions}; -use meilisearch_types::milli::Index; +use meilisearch_types::milli::{Index, Result}; use time::OffsetDateTime; use uuid::Uuid; use super::IndexStatus::{self, Available, BeingDeleted, Closing, Missing}; +use crate::clamp_to_page_size; use crate::lru::{InsertionOutcome, LruMap}; -use crate::{clamp_to_page_size, Result}; - /// Keep an internally consistent view of the open indexes in memory. /// /// This view is made of an LRU cache that will evict the least frequently used indexes when new indexes are opened. diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 3cccb5a69..2f5b176ed 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -5,6 +5,7 @@ use std::{fs, thread}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; +use meilisearch_types::milli; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; @@ -121,7 +122,7 @@ impl IndexStats { /// # Parameters /// /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. - pub fn new(index: &Index, rtxn: &RoTxn) -> Result { + pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { Ok(IndexStats { number_of_documents: index.number_of_documents(rtxn)?, database_size: index.on_disk_size()?, @@ -183,13 +184,18 @@ impl IndexMapper { // Error if the UUIDv4 somehow already exists in the map, since it should be fresh. // This is very unlikely to happen in practice. // TODO: it would be better to lazily create the index. But we need an Index::open function for milli. - let index = self.index_map.write().unwrap().create( - &uuid, - &index_path, - date, - self.enable_mdb_writemap, - self.index_base_map_size, - )?; + let index = self + .index_map + .write() + .unwrap() + .create( + &uuid, + &index_path, + date, + self.enable_mdb_writemap, + self.index_base_map_size, + ) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; wtxn.commit()?; @@ -357,7 +363,9 @@ impl IndexMapper { }; let index_path = self.base_path.join(uuid.to_string()); // take the lock to reopen the environment. 
- reopen.reopen(&mut self.index_map.write().unwrap(), &index_path)?; + reopen + .reopen(&mut self.index_map.write().unwrap(), &index_path) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; continue; } BeingDeleted => return Err(Error::IndexNotFound(name.to_string())), @@ -372,13 +380,15 @@ impl IndexMapper { Missing => { let index_path = self.base_path.join(uuid.to_string()); - break index_map.create( - &uuid, - &index_path, - None, - self.enable_mdb_writemap, - self.index_base_map_size, - )?; + break index_map + .create( + &uuid, + &index_path, + None, + self.enable_mdb_writemap, + self.index_base_map_size, + ) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; } Available(index) => break index, Closing(_) => { @@ -460,6 +470,7 @@ impl IndexMapper { let index = self.index(rtxn, index_uid)?; let index_rtxn = index.read_txn()?; IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string()))) } } } diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index bcd5966b5..67627d8c1 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -353,7 +353,7 @@ pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database String { let mut snap = String::new(); - let Batch { uid, details, stats, started_at, finished_at } = batch; + let Batch { uid, details, stats, started_at, finished_at, progress: _ } = batch; if let Some(finished_at) = finished_at { assert!(finished_at > started_at); } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index cef24c1ea..f5f73087d 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -26,6 +26,7 @@ mod index_mapper; #[cfg(test)] mod insta_snapshot; mod lru; +mod processing; mod utils; pub mod uuid_codec; @@ -56,12 +57,12 @@ use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; use meilisearch_types::milli::index::IndexEmbeddingConfig; -use meilisearch_types::milli::update::new::indexer::document_changes::Progress; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; use meilisearch_types::task_view::TaskView; -use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task, TaskProgress}; +use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; +use processing::ProcessingTasks; use rayon::current_num_threads; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; @@ -72,7 +73,8 @@ use utils::{filter_out_references_to_newer_tasks, keep_ids_within_datetimes, map use uuid::Uuid; use crate::index_mapper::IndexMapper; -use crate::utils::{check_index_swap_validity, clamp_to_page_size, ProcessingBatch}; +use crate::processing::{AtomicTaskStep, BatchProgress}; +use crate::utils::{check_index_swap_validity, clamp_to_page_size}; pub(crate) type BEI128 = I128; @@ -163,48 +165,6 @@ impl Query { } } -#[derive(Debug, Clone)] -pub struct ProcessingTasks { - batch: Option, - /// The list of tasks ids that are currently running. 
- processing: RoaringBitmap, - /// The progress on processing tasks - progress: Option, -} - -impl ProcessingTasks { - /// Creates an empty `ProcessingAt` struct. - fn new() -> ProcessingTasks { - ProcessingTasks { batch: None, processing: RoaringBitmap::new(), progress: None } - } - - /// Stores the currently processing tasks, and the date time at which it started. - fn start_processing(&mut self, processing_batch: ProcessingBatch, processing: RoaringBitmap) { - self.batch = Some(processing_batch); - self.processing = processing; - } - - fn update_progress(&mut self, progress: Progress) -> TaskProgress { - self.progress.get_or_insert_with(TaskProgress::default).update(progress) - } - - /// Set the processing tasks to an empty list - fn stop_processing(&mut self) -> Self { - self.progress = None; - - Self { - batch: std::mem::take(&mut self.batch), - processing: std::mem::take(&mut self.processing), - progress: None, - } - } - - /// Returns `true` if there, at least, is one task that is currently processing that we must stop. - fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool { - !self.processing.is_disjoint(canceled_tasks) - } -} - #[derive(Default, Clone, Debug)] struct MustStopProcessing(Arc); @@ -407,7 +367,7 @@ pub struct IndexScheduler { /// /// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation. #[cfg(test)] - test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + test_breakpoint_sdr: crossbeam_channel::Sender<(Breakpoint, bool)>, /// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler. /// @@ -476,7 +436,7 @@ impl IndexScheduler { /// Create an index scheduler and start its run loop. pub fn new( options: IndexSchedulerOptions, - #[cfg(test)] test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + #[cfg(test)] test_breakpoint_sdr: crossbeam_channel::Sender<(Breakpoint, bool)>, #[cfg(test)] planned_failures: Vec<(usize, tests::FailureLocation)>, ) -> Result { std::fs::create_dir_all(&options.tasks_path)?; @@ -813,7 +773,7 @@ impl IndexScheduler { let mut batch_tasks = RoaringBitmap::new(); for batch_uid in batch_uids { if processing_batch.as_ref().map_or(false, |batch| batch.uid == *batch_uid) { - batch_tasks |= &processing_tasks; + batch_tasks |= &*processing_tasks; } else { batch_tasks |= self.tasks_in_batch(rtxn, *batch_uid)?; } @@ -827,13 +787,13 @@ impl IndexScheduler { match status { // special case for Processing tasks Status::Processing => { - status_tasks |= &processing_tasks; + status_tasks |= &*processing_tasks; } status => status_tasks |= &self.get_status(rtxn, *status)?, }; } if !status.contains(&Status::Processing) { - tasks -= &processing_tasks; + tasks -= &*processing_tasks; } tasks &= status_tasks; } @@ -882,7 +842,7 @@ impl IndexScheduler { // Once we have filtered the two subsets, we put them back together and assign it back to `tasks`. tasks = { let (mut filtered_non_processing_tasks, mut filtered_processing_tasks) = - (&tasks - &processing_tasks, &tasks & &processing_tasks); + (&tasks - &*processing_tasks, &tasks & &*processing_tasks); // special case for Processing tasks // A closure that clears the filtered_processing_tasks if their started_at date falls outside the given bounds @@ -1090,7 +1050,7 @@ impl IndexScheduler { // Once we have filtered the two subsets, we put them back together and assign it back to `batches`. 
batches = { let (mut filtered_non_processing_batches, mut filtered_processing_batches) = - (&batches - &processing.processing, &batches & &processing.processing); + (&batches - &*processing.processing, &batches & &*processing.processing); // special case for Processing batches // A closure that clears the filtered_processing_batches if their started_at date falls outside the given bounds @@ -1440,7 +1400,7 @@ impl IndexScheduler { // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty()) - && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 50 + && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40 { return Err(Error::NoSpaceLeftInTaskQueue); } @@ -1606,7 +1566,8 @@ impl IndexScheduler { // We reset the must_stop flag to be sure that we don't stop processing tasks self.must_stop_processing.reset(); - self.processing_tasks + let progress = self + .processing_tasks .write() .unwrap() // We can clone the processing batch here because we don't want its modification to affect the view of the processing batches @@ -1619,11 +1580,12 @@ impl IndexScheduler { let res = { let cloned_index_scheduler = self.private_clone(); let processing_batch = &mut processing_batch; + let progress = progress.clone(); std::thread::scope(|s| { let handle = std::thread::Builder::new() .name(String::from("batch-operation")) .spawn_scoped(s, move || { - cloned_index_scheduler.process_batch(batch, processing_batch) + cloned_index_scheduler.process_batch(batch, processing_batch, progress) }) .unwrap(); handle.join().unwrap_or(Err(Error::ProcessBatchPanicked)) @@ -1636,6 +1598,7 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?; + progress.update_progress(BatchProgress::WritingTasksToDisk); processing_batch.finished(); let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; let mut canceled = RoaringBitmap::new(); @@ -1645,12 +1608,15 @@ impl IndexScheduler { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchSucceeded); + let (task_progress, task_progress_obj) = AtomicTaskStep::new(tasks.len() as u32); + progress.update_progress(task_progress_obj); let mut success = 0; let mut failure = 0; let mut canceled_by = None; #[allow(unused_variables)] for (i, mut task) in tasks.into_iter().enumerate() { + task_progress.fetch_add(1, Ordering::Relaxed); processing_batch.update(&mut task); if task.status == Status::Canceled { canceled.insert(task.uid); @@ -1678,9 +1644,10 @@ impl IndexScheduler { tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks."); } // If we have an abortion error we must stop the tick here and re-schedule tasks. - Err(Error::Milli(milli::Error::InternalError( - milli::InternalError::AbortedIndexation, - ))) + Err(Error::Milli { + error: milli::Error::InternalError(milli::InternalError::AbortedIndexation), + .. + }) | Err(Error::AbortedTask) => { #[cfg(test)] self.breakpoint(Breakpoint::AbortedIndexation); @@ -1699,9 +1666,10 @@ impl IndexScheduler { // 2. close the associated environment // 3. resize it // 4. re-schedule tasks - Err(Error::Milli(milli::Error::UserError( - milli::UserError::MaxDatabaseSizeReached, - ))) if index_uid.is_some() => { + Err(Error::Milli { + error: milli::Error::UserError(milli::UserError::MaxDatabaseSizeReached), + .. 
+ }) if index_uid.is_some() => { // fixme: add index_uid to match to avoid the unwrap let index_uid = index_uid.unwrap(); // fixme: handle error more gracefully? not sure when this could happen @@ -1716,8 +1684,12 @@ impl IndexScheduler { Err(err) => { #[cfg(test)] self.breakpoint(Breakpoint::ProcessBatchFailed); + let (task_progress, task_progress_obj) = AtomicTaskStep::new(ids.len() as u32); + progress.update_progress(task_progress_obj); + let error: ResponseError = err.into(); for id in ids.iter() { + task_progress.fetch_add(1, Ordering::Relaxed); let mut task = self .get_task(&wtxn, id) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? @@ -1738,11 +1710,8 @@ impl IndexScheduler { } } - self.processing_tasks.write().unwrap().stop_processing(); // We must re-add the canceled task so they're part of the same batch. - // processed.processing |= canceled; ids |= canceled; - self.write_batch(&mut wtxn, processing_batch, &ids)?; #[cfg(test)] @@ -1750,6 +1719,10 @@ impl IndexScheduler { wtxn.commit().map_err(Error::HeedTransaction)?; + // We should stop processing AFTER everything is processed and written to disk otherwise, a batch (which only lives in RAM) may appear in the processing task + // and then become « not found » for some time until the commit everything is written and the final commit is made. + self.processing_tasks.write().unwrap().stop_processing(); + // Once the tasks are committed, we should delete all the update files associated ASAP to avoid leaking files in case of a restart tracing::debug!("Deleting the update files"); @@ -1942,6 +1915,7 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, + index_uid: String, embedding_configs: Vec, ) -> Result { let res: Result<_> = embedding_configs @@ -1952,8 +1926,12 @@ impl IndexScheduler { config: milli::vector::EmbeddingConfig { embedder_options, prompt, quantized }, .. 
}| { - let prompt = - Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + let prompt = Arc::new( + prompt + .try_into() + .map_err(meilisearch_types::milli::Error::from) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + ); // optimistically return existing embedder { let embedders = self.embedders.read().unwrap(); @@ -1969,7 +1947,9 @@ impl IndexScheduler { let embedder = Arc::new( Embedder::new(embedder_options.clone()) .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, + .map_err(|err| { + Error::from_milli(err.into(), Some(index_uid.clone())) + })?, ); { let mut embedders = self.embedders.write().unwrap(); @@ -2237,7 +2217,7 @@ mod tests { use std::time::Instant; use big_s::S; - use crossbeam::channel::RecvTimeoutError; + use crossbeam_channel::RecvTimeoutError; use file_store::File; use insta::assert_json_snapshot; use maplit::btreeset; @@ -2289,7 +2269,7 @@ mod tests { configuration: impl Fn(&mut IndexSchedulerOptions), ) -> (Self, IndexSchedulerHandle) { let tempdir = TempDir::new().unwrap(); - let (sender, receiver) = crossbeam::channel::bounded(0); + let (sender, receiver) = crossbeam_channel::bounded(0); let indexer_config = IndexerConfig { skip_index_budget: true, ..Default::default() }; @@ -2421,7 +2401,7 @@ mod tests { pub struct IndexSchedulerHandle { _tempdir: TempDir, index_scheduler: IndexScheduler, - test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, + test_breakpoint_rcv: crossbeam_channel::Receiver<(Breakpoint, bool)>, last_breakpoint: Breakpoint, } @@ -4318,10 +4298,35 @@ mod tests { let proc = index_scheduler.processing_tasks.read().unwrap().clone(); let query = Query { statuses: Some(vec![Status::Processing]), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) + let (mut batches, _) = index_scheduler + .get_batches_from_authorized_indexes(query.clone(), &AuthFilter::default()) .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[0,]"); // only the processing batch in the first tick + assert_eq!(batches.len(), 1); + batches[0].started_at = OffsetDateTime::UNIX_EPOCH; + // Insta cannot snapshot our batches because the batch stats contains an enum as key: https://github.com/mitsuhiko/insta/issues/689 + let batch = serde_json::to_string_pretty(&batches[0]).unwrap(); + snapshot!(batch, @r#" + { + "uid": 0, + "details": { + "primaryKey": "mouse" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "processing": 1 + }, + "types": { + "indexCreation": 1 + }, + "indexUids": { + "catto": 1 + } + }, + "startedAt": "1970-01-01T00:00:00Z", + "finishedAt": null + } + "#); let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() }; let (batches, _) = index_scheduler @@ -6145,7 +6150,7 @@ mod tests { insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); - let configs = index_scheduler.embedders(configs).unwrap(); + let configs = index_scheduler.embedders("doggos".to_string(), configs).unwrap(); let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); let beagle_embed = hf_embedder.embed_one(S("Intel the beagle best doggo"), None).unwrap(); diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs new file mode 100644 index 000000000..aca654de9 --- /dev/null +++ b/crates/index-scheduler/src/processing.rs @@ -0,0 +1,316 @@ +use std::borrow::Cow; +use 
std::sync::Arc; + +use enum_iterator::Sequence; +use meilisearch_types::milli::progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step}; +use meilisearch_types::milli::{make_atomic_progress, make_enum_progress}; +use roaring::RoaringBitmap; + +use crate::utils::ProcessingBatch; + +#[derive(Clone)] +pub struct ProcessingTasks { + pub batch: Option>, + /// The list of tasks ids that are currently running. + pub processing: Arc, + /// The progress on processing tasks + pub progress: Option, +} + +impl ProcessingTasks { + /// Creates an empty `ProcessingAt` struct. + pub fn new() -> ProcessingTasks { + ProcessingTasks { batch: None, processing: Arc::new(RoaringBitmap::new()), progress: None } + } + + pub fn get_progress_view(&self) -> Option { + Some(self.progress.as_ref()?.as_progress_view()) + } + + /// Stores the currently processing tasks, and the date time at which it started. + pub fn start_processing( + &mut self, + processing_batch: ProcessingBatch, + processing: RoaringBitmap, + ) -> Progress { + self.batch = Some(Arc::new(processing_batch)); + self.processing = Arc::new(processing); + let progress = Progress::default(); + progress.update_progress(BatchProgress::ProcessingTasks); + self.progress = Some(progress.clone()); + + progress + } + + /// Set the processing tasks to an empty list + pub fn stop_processing(&mut self) -> Self { + self.progress = None; + + Self { + batch: std::mem::take(&mut self.batch), + processing: std::mem::take(&mut self.processing), + progress: None, + } + } + + /// Returns `true` if there, at least, is one task that is currently processing that we must stop. + pub fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool { + !self.processing.is_disjoint(canceled_tasks) + } +} + +make_enum_progress! { + pub enum BatchProgress { + ProcessingTasks, + WritingTasksToDisk, + } +} + +make_enum_progress! { + pub enum TaskCancelationProgress { + RetrievingTasks, + UpdatingTasks, + } +} + +make_enum_progress! { + pub enum TaskDeletionProgress { + DeletingTasksDateTime, + DeletingTasksMetadata, + DeletingTasks, + DeletingBatches, + } +} + +make_enum_progress! { + pub enum SnapshotCreationProgress { + StartTheSnapshotCreation, + SnapshotTheIndexScheduler, + SnapshotTheUpdateFiles, + SnapshotTheIndexes, + SnapshotTheApiKeys, + CreateTheTarball, + } +} + +make_enum_progress! { + pub enum DumpCreationProgress { + StartTheDumpCreation, + DumpTheApiKeys, + DumpTheTasks, + DumpTheIndexes, + DumpTheExperimentalFeatures, + CompressTheDump, + } +} + +make_enum_progress! { + pub enum CreateIndexProgress { + CreatingTheIndex, + } +} + +make_enum_progress! { + pub enum UpdateIndexProgress { + UpdatingTheIndex, + } +} + +make_enum_progress! { + pub enum DeleteIndexProgress { + DeletingTheIndex, + } +} + +make_enum_progress! { + pub enum SwappingTheIndexes { + EnsuringCorrectnessOfTheSwap, + SwappingTheIndexes, + } +} + +make_enum_progress! { + pub enum InnerSwappingTwoIndexes { + RetrieveTheTasks, + UpdateTheTasks, + UpdateTheIndexesMetadata, + } +} + +make_enum_progress! { + pub enum DocumentOperationProgress { + RetrievingConfig, + ComputingDocumentChanges, + Indexing, + } +} + +make_enum_progress! { + pub enum DocumentEditionProgress { + RetrievingConfig, + ComputingDocumentChanges, + Indexing, + } +} + +make_enum_progress! { + pub enum DocumentDeletionProgress { + RetrievingConfig, + DeleteDocuments, + Indexing, + } +} + +make_enum_progress! 
+make_enum_progress! {
+    pub enum BatchProgress {
+        ProcessingTasks,
+        WritingTasksToDisk,
+    }
+}
+
+make_enum_progress! {
+    pub enum TaskCancelationProgress {
+        RetrievingTasks,
+        UpdatingTasks,
+    }
+}
+
+make_enum_progress! {
+    pub enum TaskDeletionProgress {
+        DeletingTasksDateTime,
+        DeletingTasksMetadata,
+        DeletingTasks,
+        DeletingBatches,
+    }
+}
+
+make_enum_progress! {
+    pub enum SnapshotCreationProgress {
+        StartTheSnapshotCreation,
+        SnapshotTheIndexScheduler,
+        SnapshotTheUpdateFiles,
+        SnapshotTheIndexes,
+        SnapshotTheApiKeys,
+        CreateTheTarball,
+    }
+}
+
+make_enum_progress! {
+    pub enum DumpCreationProgress {
+        StartTheDumpCreation,
+        DumpTheApiKeys,
+        DumpTheTasks,
+        DumpTheIndexes,
+        DumpTheExperimentalFeatures,
+        CompressTheDump,
+    }
+}
+
+make_enum_progress! {
+    pub enum CreateIndexProgress {
+        CreatingTheIndex,
+    }
+}
+
+make_enum_progress! {
+    pub enum UpdateIndexProgress {
+        UpdatingTheIndex,
+    }
+}
+
+make_enum_progress! {
+    pub enum DeleteIndexProgress {
+        DeletingTheIndex,
+    }
+}
+
+make_enum_progress! {
+    pub enum SwappingTheIndexes {
+        EnsuringCorrectnessOfTheSwap,
+        SwappingTheIndexes,
+    }
+}
+
+make_enum_progress! {
+    pub enum InnerSwappingTwoIndexes {
+        RetrieveTheTasks,
+        UpdateTheTasks,
+        UpdateTheIndexesMetadata,
+    }
+}
+
+make_enum_progress! {
+    pub enum DocumentOperationProgress {
+        RetrievingConfig,
+        ComputingDocumentChanges,
+        Indexing,
+    }
+}
+
+make_enum_progress! {
+    pub enum DocumentEditionProgress {
+        RetrievingConfig,
+        ComputingDocumentChanges,
+        Indexing,
+    }
+}
+
+make_enum_progress! {
+    pub enum DocumentDeletionProgress {
+        RetrievingConfig,
+        DeleteDocuments,
+        Indexing,
+    }
+}
+
+make_enum_progress! {
+    pub enum SettingsProgress {
+        RetrievingAndMergingTheSettings,
+        ApplyTheSettings,
+    }
+}
+
+make_atomic_progress!(Task alias AtomicTaskStep => "task" );
+make_atomic_progress!(Document alias AtomicDocumentStep => "document" );
+make_atomic_progress!(Batch alias AtomicBatchStep => "batch" );
+make_atomic_progress!(UpdateFile alias AtomicUpdateFileStep => "update file" );
+
+pub struct VariableNameStep {
+    name: String,
+    current: u32,
+    total: u32,
+}
+
+impl VariableNameStep {
+    pub fn new(name: impl Into<String>, current: u32, total: u32) -> Self {
+        Self { name: name.into(), current, total }
+    }
+}
+
+impl Step for VariableNameStep {
+    fn name(&self) -> Cow<'static, str> {
+        self.name.clone().into()
+    }
+
+    fn current(&self) -> u32 {
+        self.current
+    }
+
+    fn total(&self) -> u32 {
+        self.total
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::sync::atomic::Ordering;
+
+    use meili_snap::{json_string, snapshot};
+
+    use super::*;
+
+    #[test]
+    fn one_level() {
+        let mut processing = ProcessingTasks::new();
+        processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new());
+        snapshot!(json_string!(processing.get_progress_view()), @r#"
+        {
+          "steps": [
+            {
+              "currentStep": "processing tasks",
+              "finished": 0,
+              "total": 2
+            }
+          ],
+          "percentage": 0.0
+        }
+        "#);
+        processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk);
+        snapshot!(json_string!(processing.get_progress_view()), @r#"
+        {
+          "steps": [
+            {
+              "currentStep": "writing tasks to disk",
+              "finished": 1,
+              "total": 2
+            }
+          ],
+          "percentage": 50.0
+        }
+        "#);
+    }
+
+    #[test]
+    fn task_progress() {
+        let mut processing = ProcessingTasks::new();
+        processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new());
+        let (atomic, tasks) = AtomicTaskStep::new(10);
+        processing.progress.as_ref().unwrap().update_progress(tasks);
+        snapshot!(json_string!(processing.get_progress_view()), @r#"
+        {
+          "steps": [
+            {
+              "currentStep": "processing tasks",
+              "finished": 0,
+              "total": 2
+            },
+            {
+              "currentStep": "task",
+              "finished": 0,
+              "total": 10
+            }
+          ],
+          "percentage": 0.0
+        }
+        "#);
+        atomic.fetch_add(6, Ordering::Relaxed);
+        snapshot!(json_string!(processing.get_progress_view()), @r#"
+        {
+          "steps": [
+            {
+              "currentStep": "processing tasks",
+              "finished": 0,
+              "total": 2
+            },
+            {
+              "currentStep": "task",
+              "finished": 6,
+              "total": 10
+            }
+          ],
+          "percentage": 30.000002
+        }
+        "#);
+        processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk);
+        snapshot!(json_string!(processing.get_progress_view()), @r#"
+        {
+          "steps": [
+            {
+              "currentStep": "writing tasks to disk",
+              "finished": 1,
+              "total": 2
+            }
+          ],
+          "percentage": 50.0
+        }
+        "#);
+        let (atomic, tasks) = AtomicTaskStep::new(5);
+        processing.progress.as_ref().unwrap().update_progress(tasks);
+        atomic.fetch_add(4, Ordering::Relaxed);
+        snapshot!(json_string!(processing.get_progress_view()), @r#"
+        {
+          "steps": [
+            {
+              "currentStep": "writing tasks to disk",
+              "finished": 1,
+              "total": 2
+            },
+            {
+              "currentStep": "task",
+              "finished": 4,
+              "total": 5
+            }
+          ],
+          "percentage": 90.0
+        }
+        "#);
+    }
+}
diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap
index 9710c4911..b73714e36 100644
--- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap
@@ -5,7 +5,7 @@
snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, } +{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap index e70aa0850..c24c36313 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, } +{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap index 55c7b3ed2..b9f33e598 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"dumpCreation":1},"indexUids":{}}, } +{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"dumpCreation":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap index 91b4deb22..0b9a0d709 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap +++ 
b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap index 89e8c8c6f..fef6c20f6 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap index 12e1b1283..3f45be007 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git 
a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap b/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap index f7eaa6df8..8beb49145 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap index f7eaa6df8..8beb49145 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap index 8d175e388..bda90680f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap index d1de7ec61..be79abf21 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch 
None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap index 114df2852..0ee4d91e5 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap +++ 
b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap @@ -1,17 +1,16 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_document_ids: 1, deleted_documents: Some(1) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} -3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Invalid type for filter subexpression: expected: String, 
Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} -4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} +3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} +4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} 5 {uid: 5, batch_uid: 2, status: succeeded, details: { original_filter: "catto EXISTS", deleted_documents: Some(1) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("catto EXISTS") }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap index b2b368be4..43be57779 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: 
WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_document_ids: 1, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, status: enqueued, details: { original_filter: true, deleted_documents: None }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap index 9e1995fee..ca1866473 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: 
Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap index 0091af65b..8ab4d84dd 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: 
{"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap index 0091af65b..8ab4d84dd 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap index 11995b0bd..f581defa8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, 
document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, 
localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap index 9c028d141..27522376f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: 
Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: 
NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap index 5c83f6cac..28504ffea 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/lib.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": 
Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), 
document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap index c8f174c74..288f2bc88 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), 
request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: 
SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap index f9e6df03e..ff63c0caf 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: 
Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: 
NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap index 24d5fff27..77367f06b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, 
document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap index aafef2fce..9d3f29c48 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap index 86fea2386..322bcf4ab 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap index ea910f491..aa047e3ff 100644 --- 
a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap b/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap index 869e38e57..bf5d0528c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"doggo":2}}, } +{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap b/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap index fce223c6c..85a0afc46 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap @@ -5,7 +5,7 @@ snapshot_kind: text ### Autobatching Enabled = true ### Processing batch Some(0): [3,] -{uid: 0, details: {"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"taskDeletion":1},"indexUids":{}}, } +{uid: 0, details: {"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"taskDeletion":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap index 22900371e..e2668fcea 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ 
b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, 
stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap index dae9b38cd..7f08c0575 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap @@ -1,13 +1,12 @@ --- source: crates/index-scheduler/src/lib.rs -snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true 
}} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 1ca782f8c..1fcedfddf 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -67,7 +67,7 @@ impl ProcessingBatch { task.batch_uid = Some(self.uid); // We don't store the statuses in the map since they're all enqueued but we must // still store them in the stats since that can be displayed. - *self.stats.status.entry(task.status).or_default() += 1; + *self.stats.status.entry(Status::Processing).or_default() += 1; self.kinds.insert(task.kind.as_kind()); *self.stats.types.entry(task.kind.as_kind()).or_default() += 1; @@ -106,7 +106,7 @@ impl ProcessingBatch { self.stats.total_nb_tasks = 0; } - /// Update the timestamp of the tasks and the inner structure of this sturcture. + /// Update the timestamp of the tasks and the inner structure of this structure. 
pub fn update(&mut self, task: &mut Task) { // We must re-set this value in case we're dealing with a task that has been added between // the `processing` and `finished` state @@ -134,6 +134,7 @@ impl ProcessingBatch { pub fn to_batch(&self) -> Batch { Batch { uid: self.uid, + progress: None, details: self.details.clone(), stats: self.stats.clone(), started_at: self.started_at, @@ -187,6 +188,7 @@ impl IndexScheduler { &batch.uid, &Batch { uid: batch.uid, + progress: None, details: batch.details, stats: batch.stats, started_at: batch.started_at, @@ -273,7 +275,9 @@ impl IndexScheduler { .into_iter() .map(|batch_id| { if Some(batch_id) == processing.batch.as_ref().map(|batch| batch.uid) { - Ok(processing.batch.as_ref().unwrap().to_batch()) + let mut batch = processing.batch.as_ref().unwrap().to_batch(); + batch.progress = processing.get_progress_view(); + Ok(batch) } else { self.get_batch(rtxn, batch_id) .and_then(|task| task.ok_or(Error::CorruptedTaskQueue)) diff --git a/crates/meilisearch-auth/Cargo.toml b/crates/meilisearch-auth/Cargo.toml index ae0095ab4..591a40158 100644 --- a/crates/meilisearch-auth/Cargo.toml +++ b/crates/meilisearch-auth/Cargo.toml @@ -17,7 +17,7 @@ hmac = "0.12.1" maplit = "1.0.2" meilisearch-types = { path = "../meilisearch-types" } rand = "0.8.5" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } sha2 = "0.10.8" diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index 349c06080..76d8d11ca 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -24,8 +24,9 @@ flate2 = "1.0.30" fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } -roaring = { version = "0.10.6", features = ["serde"] } +bumparaw-collections = "0.1.2" +roaring = { version = "0.10.7", features = ["serde"] } +rustc-hash = "2.1.0" serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" serde_json = "1.0.120" diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index 5d800d897..08d25413c 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -1,16 +1,16 @@ +use milli::progress::ProgressView; use serde::Serialize; use time::{Duration, OffsetDateTime}; -use crate::{ - batches::{Batch, BatchId, BatchStats}, - task_view::DetailsView, - tasks::serialize_duration, -}; +use crate::batches::{Batch, BatchId, BatchStats}; +use crate::task_view::DetailsView; +use crate::tasks::serialize_duration; #[derive(Debug, Clone, Serialize)] #[serde(rename_all = "camelCase")] pub struct BatchView { pub uid: BatchId, + pub progress: Option, pub details: DetailsView, pub stats: BatchStats, #[serde(serialize_with = "serialize_duration", default)] @@ -25,6 +25,7 @@ impl BatchView { pub fn from_batch(batch: &Batch) -> Self { Self { uid: batch.uid, + progress: batch.progress.clone(), details: batch.details.clone(), stats: batch.stats.clone(), duration: batch.finished_at.map(|finished_at| finished_at - batch.started_at), diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index a60386e52..664dafa7a 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -1,12 +1,11 @@ 
use std::collections::BTreeMap; +use milli::progress::ProgressView; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; -use crate::{ - task_view::DetailsView, - tasks::{Kind, Status}, -}; +use crate::task_view::DetailsView; +use crate::tasks::{Kind, Status}; pub type BatchId = u32; @@ -15,6 +14,8 @@ pub type BatchId = u32; pub struct Batch { pub uid: BatchId, + #[serde(skip)] + pub progress: Option<ProgressView>, pub details: DetailsView, pub stats: BatchStats, diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index 096349448..70a0e6204 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -4,10 +4,11 @@ use std::io::{self, BufWriter}; use std::marker::PhantomData; use bumpalo::Bump; +use bumparaw_collections::RawMap; use memmap2::Mmap; use milli::documents::Error; use milli::Object; -use raw_collections::RawMap; +use rustc_hash::FxBuildHasher; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; @@ -214,13 +215,13 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> { // We memory map to be able to deserialize into a RawMap that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; - let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1MiB + let mut doc_alloc = Bump::with_capacity(1024 * 1024); // 1MiB let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); let res = array_each(&mut deserializer, |obj: &RawValue| { doc_alloc.reset(); - let map = RawMap::from_raw_value(obj, &doc_alloc)?; + let map = RawMap::from_raw_value_and_hasher(obj, FxBuildHasher, &doc_alloc)?; to_writer(&mut out, &map) }); let count = match res { @@ -250,26 +251,25 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> { } } -/// Reads NDJSON from file and write it in NDJSON in a file checking it along the way. -pub fn read_ndjson(input: &File, output: impl io::Write) -> Result<u64> { +/// Reads NDJSON from file and checks it. +pub fn read_ndjson(input: &File) -> Result<u64> { // We memory map to be able to deserialize into a RawMap that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? 
}; - let mut output = BufWriter::new(output); - let mut bump = Bump::with_capacity(1024 * 1024); let mut count = 0; for result in serde_json::Deserializer::from_slice(&input).into_iter() { bump.reset(); - count += 1; - result - .and_then(|raw: &RawValue| { + match result { + Ok(raw) => { // try to deserialize as a map - let map = RawMap::from_raw_value(raw, &bump)?; - to_writer(&mut output, &map) - }) - .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; + RawMap::from_raw_value_and_hasher(raw, FxBuildHasher, &bump) + .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; + count += 1; + } + Err(e) => return Err(DocumentFormatError::from((PayloadType::Ndjson, e))), + } } Ok(count) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 00f88b7b4..0c4027899 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -279,6 +279,7 @@ InvalidSearchPage , InvalidRequest , BAD_REQUEST ; InvalidSearchQ , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchQuery , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ; +FacetSearchDisabled , InvalidRequest , BAD_REQUEST ; InvalidSearchVector , InvalidRequest , BAD_REQUEST ; InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScore , InvalidRequest , BAD_REQUEST ; @@ -290,6 +291,8 @@ InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ; InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ; InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ; +InvalidSettingsFacetSearch , InvalidRequest , BAD_REQUEST ; +InvalidSettingsPrefixSearch , InvalidRequest , BAD_REQUEST ; InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ; InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ; @@ -547,7 +550,7 @@ impl fmt::Display for deserr_codes::InvalidSimilarId { "the value of `id` is invalid. \ A document identifier can be of type integer or string, \ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \ - and can not be more than 512 bytes." + and can not be more than 511 bytes." 
) } } diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index e3803fa28..b12dfc9a2 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -8,7 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; -use milli::index::IndexEmbeddingConfig; +use milli::index::{IndexEmbeddingConfig, PrefixSearch}; use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; @@ -202,6 +202,12 @@ pub struct Settings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] pub localized_attributes: Setting>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + pub facet_search: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + pub prefix_search: Setting, #[serde(skip)] #[deserr(skip)] @@ -266,6 +272,8 @@ impl Settings { embedders: Setting::Reset, search_cutoff_ms: Setting::Reset, localized_attributes: Setting::Reset, + facet_search: Setting::Reset, + prefix_search: Setting::Reset, _kind: PhantomData, } } @@ -290,6 +298,8 @@ impl Settings { embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind, } = self; @@ -312,6 +322,8 @@ impl Settings { embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind: PhantomData, } } @@ -360,6 +372,8 @@ impl Settings { embedders: self.embedders, search_cutoff_ms: self.search_cutoff_ms, localized_attributes: self.localized_attributes, + facet_search: self.facet_search, + prefix_search: self.prefix_search, _kind: PhantomData, } } @@ -433,6 +447,8 @@ impl Settings { Setting::Set(this) } }, + prefix_search: other.prefix_search.or(self.prefix_search), + facet_search: other.facet_search.or(self.facet_search), _kind: PhantomData, } } @@ -469,6 +485,8 @@ pub fn apply_settings_to_builder( embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind, } = settings; @@ -657,6 +675,20 @@ pub fn apply_settings_to_builder( Setting::Reset => builder.reset_search_cutoff(), Setting::NotSet => (), } + + match prefix_search { + Setting::Set(prefix_search) => { + builder.set_prefix_search(PrefixSearch::from(*prefix_search)) + } + Setting::Reset => builder.reset_prefix_search(), + Setting::NotSet => (), + } + + match facet_search { + Setting::Set(facet_search) => builder.set_facet_search(*facet_search), + Setting::Reset => builder.reset_facet_search(), + Setting::NotSet => (), + } } pub enum SecretPolicy { @@ -755,6 +787,10 @@ pub fn settings( let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; + let prefix_search = index.prefix_search(rtxn)?.map(PrefixSearchSettings::from); + + let facet_search = index.facet_search(rtxn)?; + let mut settings = Settings { displayed_attributes: match displayed_attributes { Some(attrs) => Setting::Set(attrs), @@ -791,13 +827,14 @@ pub fn settings( Some(rules) => Setting::Set(rules.into_iter().map(|r| r.into()).collect()), None => Setting::Reset, }, + prefix_search: Setting::Set(prefix_search.unwrap_or_default()), + facet_search: Setting::Set(facet_search), _kind: PhantomData, }; if let SecretPolicy::HideSecrets = secret_policy { 
settings.hide_secrets() } - Ok(settings) } @@ -964,6 +1001,32 @@ impl std::ops::Deref for WildcardSetting { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +pub enum PrefixSearchSettings { + #[default] + IndexingTime, + Disabled, +} + +impl From for PrefixSearchSettings { + fn from(value: PrefixSearch) -> Self { + match value { + PrefixSearch::IndexingTime => PrefixSearchSettings::IndexingTime, + PrefixSearch::Disabled => PrefixSearchSettings::Disabled, + } + } +} +impl From for PrefixSearch { + fn from(value: PrefixSearchSettings) -> Self { + match value { + PrefixSearchSettings::IndexingTime => PrefixSearch::IndexingTime, + PrefixSearchSettings::Disabled => PrefixSearch::Disabled, + } + } +} + #[cfg(test)] pub(crate) mod test { use super::*; @@ -990,6 +1053,8 @@ pub(crate) mod test { embedders: Setting::NotSet, localized_attributes: Setting::NotSet, search_cutoff_ms: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: PhantomData::, }; @@ -1019,6 +1084,8 @@ pub(crate) mod test { embedders: Setting::NotSet, localized_attributes: Setting::NotSet, search_cutoff_ms: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: PhantomData::, }; diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index ebd28f526..c62f550ae 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -4,7 +4,6 @@ use std::fmt::{Display, Write}; use std::str::FromStr; use enum_iterator::Sequence; -use milli::update::new::indexer::document_changes::Progress; use milli::update::IndexDocumentsMethod; use milli::Object; use roaring::RoaringBitmap; @@ -41,62 +40,6 @@ pub struct Task { pub kind: KindWithContent, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct TaskProgress { - pub current_step: &'static str, - pub finished_steps: u16, - pub total_steps: u16, - pub finished_substeps: Option, - pub total_substeps: Option, -} - -impl Default for TaskProgress { - fn default() -> Self { - Self::new() - } -} - -impl TaskProgress { - pub fn new() -> Self { - Self { - current_step: "start", - finished_steps: 0, - total_steps: 1, - finished_substeps: None, - total_substeps: None, - } - } - - pub fn update(&mut self, progress: Progress) -> TaskProgress { - if self.finished_steps > progress.finished_steps { - return *self; - } - - if self.current_step != progress.step_name { - self.current_step = progress.step_name - } - - self.total_steps = progress.total_steps; - - if self.finished_steps < progress.finished_steps { - self.finished_substeps = None; - self.total_substeps = None; - } - self.finished_steps = progress.finished_steps; - if let Some((finished_substeps, total_substeps)) = progress.finished_total_substep { - if let Some(task_finished_substeps) = self.finished_substeps { - if task_finished_substeps > finished_substeps { - return *self; - } - } - self.finished_substeps = Some(finished_substeps); - self.total_substeps = Some(total_substeps); - } - *self - } -} - impl Task { pub fn index_uid(&self) -> Option<&str> { use KindWithContent::*; diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index b11d90151..68ca8e136 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -57,7 +57,7 
@@ meilisearch-types = { path = "../meilisearch-types" } mimalloc = { version = "0.1.43", default-features = false } mime = "0.3.17" num_cpus = "1.16.0" -obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } +obkv = "0.3.0" once_cell = "1.19.0" ordered-float = "4.2.1" parking_lot = "0.12.3" @@ -103,7 +103,7 @@ tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.11" build-info = { version = "1.7.0", path = "../build-info" } -roaring = "0.10.2" +roaring = "0.10.7" mopa-maintained = "0.2.3" [dev-dependencies] @@ -157,5 +157,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip" -sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.16/build.zip" +sha1 = "68f83438a114aabbe76bc9fe480071e741996662" diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index 5c4ce171f..41d62507a 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -4,6 +4,7 @@ use byte_unit::{Byte, UnitType}; use meilisearch_types::document_formats::{DocumentFormatError, PayloadType}; use meilisearch_types::error::{Code, ErrorCode, ResponseError}; use meilisearch_types::index_uid::{IndexUid, IndexUidFormatError}; +use meilisearch_types::milli; use meilisearch_types::milli::OrderBy; use serde_json::Value; use tokio::task::JoinError; @@ -62,8 +63,11 @@ pub enum MeilisearchHttpError { HeedError(#[from] meilisearch_types::heed::Error), #[error(transparent)] IndexScheduler(#[from] index_scheduler::Error), - #[error(transparent)] - Milli(#[from] meilisearch_types::milli::Error), + #[error("{}", match .index_name { + Some(name) if !name.is_empty() => format!("Index `{}`: {error}", name), + _ => format!("{error}") + })] + Milli { error: milli::Error, index_name: Option<String> }, #[error(transparent)] Payload(#[from] PayloadError), #[error(transparent)] @@ -76,6 +80,12 @@ pub enum MeilisearchHttpError { MissingSearchHybrid, } +impl MeilisearchHttpError { + pub(crate) fn from_milli(error: milli::Error, index_name: Option<String>) -> Self { + Self::Milli { error, index_name } + } +} + impl ErrorCode for MeilisearchHttpError { fn error_code(&self) -> Code { match self { @@ -95,7 +105,7 @@ impl ErrorCode for MeilisearchHttpError { MeilisearchHttpError::SerdeJson(_) => Code::Internal, MeilisearchHttpError::HeedError(_) => Code::Internal, MeilisearchHttpError::IndexScheduler(e) => e.error_code(), - MeilisearchHttpError::Milli(e) => e.error_code(), + MeilisearchHttpError::Milli { error, .. } => error.error_code(), MeilisearchHttpError::Payload(e) => e.error_code(), MeilisearchHttpError::FileStore(_) => Code::Internal, MeilisearchHttpError::DocumentFormat(e) => e.error_code(), diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 1406fca05..88d3419e3 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -397,6 +397,7 @@ fn import_dump( for index_reader in dump_reader.indexes()? 
{ let mut index_reader = index_reader?; let metadata = index_reader.metadata(); + let uid = metadata.uid.clone(); tracing::info!("Importing index `{}`.", metadata.uid); let date = Some((metadata.created_at, metadata.updated_at)); @@ -434,7 +435,7 @@ fn import_dump( let reader = DocumentsBatchReader::from_reader(reader)?; let embedder_configs = index.embedding_configs(&wtxn)?; - let embedders = index_scheduler.embedders(embedder_configs)?; + let embedders = index_scheduler.embedders(uid, embedder_configs)?; let builder = milli::update::IndexDocuments::new( &mut wtxn, diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index c0652bf1e..ee3bbf430 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -20,14 +20,14 @@ use meilisearch::{ LogStderrType, Opt, SubscriberForSecondLayer, }; use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE}; -use mimalloc::MiMalloc; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt as _; use tracing_subscriber::Layer; +#[cfg(not(windows))] #[global_allocator] -static ALLOC: MiMalloc = MiMalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; fn default_log_route_layer() -> LogRouteType { None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF)) @@ -129,6 +129,11 @@ async fn try_main() -> anyhow::Result<()> { print_launch_resume(&opt, analytics.clone(), config_read_from); + tokio::spawn(async move { + tokio::signal::ctrl_c().await.unwrap(); + std::process::exit(130); + }); + run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?; Ok(()) diff --git a/crates/meilisearch/src/routes/batches.rs b/crates/meilisearch/src/routes/batches.rs index 6faedc021..4d42cdd16 100644 --- a/crates/meilisearch/src/routes/batches.rs +++ b/crates/meilisearch/src/routes/batches.rs @@ -1,18 +1,18 @@ -use actix_web::{ - web::{self, Data}, - HttpResponse, -}; +use actix_web::web::{self, Data}; +use actix_web::HttpResponse; use deserr::actix_web::AwebQueryParameter; use index_scheduler::{IndexScheduler, Query}; -use meilisearch_types::{ - batch_view::BatchView, batches::BatchId, deserr::DeserrQueryParamError, error::ResponseError, - keys::actions, -}; +use meilisearch_types::batch_view::BatchView; +use meilisearch_types::batches::BatchId; +use meilisearch_types::deserr::DeserrQueryParamError; +use meilisearch_types::error::ResponseError; +use meilisearch_types::keys::actions; use serde::Serialize; -use crate::extractors::{authentication::GuardedData, sequential_extractor::SeqHandler}; - -use super::{tasks::TasksFilterQuery, ActionPolicy}; +use super::tasks::TasksFilterQuery; +use super::ActionPolicy; +use crate::extractors::authentication::GuardedData; +use crate::extractors::sequential_extractor::SeqHandler; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::get().to(SeqHandler(get_batches)))) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 47f73ef42..5f79000bd 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; -use std::io::ErrorKind; +use std::io::{ErrorKind, Seek as _}; use std::marker::PhantomData; use actix_web::http::header::CONTENT_TYPE; @@ -572,7 +572,7 @@ async fn document_addition( 
index_uid: IndexUid, primary_key: Option<String>, csv_delimiter: Option<u8>, - mut body: Payload, + body: Payload, method: IndexDocumentsMethod, task_id: Option<TaskId>, dry_run: bool, @@ -609,54 +609,60 @@ async fn document_addition( }; let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?; + let documents_count = match format { + PayloadType::Ndjson => { + let (path, file) = update_file.into_parts(); + let file = match file { + Some(file) => { + let (file, path) = file.into_parts(); + let mut file = copy_body_to_file(file, body, format).await?; + file.rewind().map_err(|e| { + index_scheduler::Error::FileStore(file_store::Error::IoError(e)) + })?; + Some(tempfile::NamedTempFile::from_parts(file, path)) + } + None => None, + }; - let temp_file = match tempfile() { - Ok(file) => file, - Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + let documents_count = tokio::task::spawn_blocking(move || { + let documents_count = file.as_ref().map_or(Ok(0), |ntf| { + read_ndjson(ntf.as_file()).map_err(MeilisearchHttpError::DocumentFormat) + })?; + + let update_file = file_store::File::from_parts(path, file); + update_file.persist()?; + + Ok(documents_count) + }) + .await?; + + Ok(documents_count) + } + PayloadType::Json | PayloadType::Csv { delimiter: _ } => { + let temp_file = match tempfile() { + Ok(file) => file, + Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + }; + + let read_file = copy_body_to_file(temp_file, body, format).await?; + tokio::task::spawn_blocking(move || { + let documents_count = match format { + PayloadType::Json => read_json(&read_file, &mut update_file)?, + PayloadType::Csv { delimiter } => { + read_csv(&read_file, &mut update_file, delimiter)? + } + PayloadType::Ndjson => { + unreachable!("We already wrote the user content into the update file") + } + }; + // we NEED to persist the file here because we moved the `update_file` in another task. + update_file.persist()?; + Ok(documents_count) + }) + .await + } }; - let async_file = File::from_std(temp_file); - let mut buffer = BufWriter::new(async_file); - - let mut buffer_write_size: usize = 0; - while let Some(result) = body.next().await { - let byte = result?; - - if byte.is_empty() && buffer_write_size == 0 { - return Err(MeilisearchHttpError::MissingPayload(format)); - } - - match buffer.write_all(&byte).await { - Ok(()) => buffer_write_size += 1, - Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), - } - } - - if let Err(e) = buffer.flush().await { - return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); - } - - if buffer_write_size == 0 { - return Err(MeilisearchHttpError::MissingPayload(format)); - } - - if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await { - return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); - } - - let read_file = buffer.into_inner().into_std().await; - let documents_count = tokio::task::spawn_blocking(move || { - let documents_count = match format { - PayloadType::Json => read_json(&read_file, &mut update_file)?, - PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, - PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?, - }; - // we NEED to persist the file here because we moved the `udpate_file` in another task.
- update_file.persist()?; - Ok(documents_count) - }) - .await; - let documents_count = match documents_count { Ok(Ok(documents_count)) => documents_count, // in this case the file has not possibly be persisted. @@ -703,6 +709,39 @@ async fn document_addition( Ok(task.into()) } +async fn copy_body_to_file( + output: std::fs::File, + mut body: Payload, + format: PayloadType, +) -> Result { + let async_file = File::from_std(output); + let mut buffer = BufWriter::new(async_file); + let mut buffer_write_size: usize = 0; + while let Some(result) = body.next().await { + let byte = result?; + + if byte.is_empty() && buffer_write_size == 0 { + return Err(MeilisearchHttpError::MissingPayload(format)); + } + + match buffer.write_all(&byte).await { + Ok(()) => buffer_write_size += 1, + Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + } + } + if let Err(e) = buffer.flush().await { + return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + } + if buffer_write_size == 0 { + return Err(MeilisearchHttpError::MissingPayload(format)); + } + if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await { + return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + } + let read_file = buffer.into_inner().into_std().await; + Ok(read_file) +} + pub async fn delete_documents_batch( index_scheduler: GuardedData, Data>, index_uid: web::Path, diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 99a4a4f28..ff11f1305 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -185,7 +185,8 @@ pub async fn search( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&search_query, &index_scheduler, &index, features)?; + let search_kind = + search_kind(&search_query, &index_scheduler, index_uid.to_string(), &index, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { perform_facet_search( diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index b36dbbbff..1355ac6c4 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -5,7 +5,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::{AwebJson, AwebQueryParameter}; use deserr::{DeserializeError, Deserr, ValuePointerRef}; -use index_scheduler::IndexScheduler; +use index_scheduler::{Error, IndexScheduler}; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{immutable_field_error, DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; @@ -109,7 +109,10 @@ pub async fn list_indexes( if !filters.is_index_authorized(uid) { return Ok(None); } - Ok(Some(IndexView::new(uid.to_string(), index)?)) + Ok(Some( + IndexView::new(uid.to_string(), index) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, + )) })?; // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. 
let indexes: Vec = indexes.into_iter().flatten().collect(); diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 966361e76..291193c4e 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -243,11 +243,19 @@ pub async fn search_with_url_query( let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let search_kind = + search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vector, index_scheduler.features()) + perform_search( + index_uid.to_string(), + &index, + query, + search_kind, + retrieve_vector, + index_scheduler.features(), + ) }) .await; permit.drop().await; @@ -287,12 +295,20 @@ pub async fn search_with_post( let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let search_kind = + search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?; let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vectors, index_scheduler.features()) + perform_search( + index_uid.to_string(), + &index, + query, + search_kind, + retrieve_vectors, + index_scheduler.features(), + ) }) .await; permit.drop().await; @@ -314,6 +330,7 @@ pub async fn search_with_post( pub fn search_kind( query: &SearchQuery, index_scheduler: &IndexScheduler, + index_uid: String, index: &milli::Index, features: RoFeatures, ) -> Result { @@ -332,7 +349,7 @@ pub fn search_kind( (None, _, None) => Ok(SearchKind::KeywordOnly), // hybrid.semantic_ratio == 1.0 => vector (_, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { - SearchKind::semantic(index_scheduler, index, embedder, v.map(|v| v.len())) + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) } // hybrid.semantic_ratio == 0.0 => keyword (_, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { @@ -340,13 +357,14 @@ pub fn search_kind( } // no query, hybrid, vector => semantic (None, Some(HybridQuery { semantic_ratio: _, embedder }), Some(v)) => { - SearchKind::semantic(index_scheduler, index, embedder, Some(v.len())) + SearchKind::semantic(index_scheduler, index_uid, index, embedder, Some(v.len())) } // query, no hybrid, no vector => keyword (Some(_), None, None) => Ok(SearchKind::KeywordOnly), // query, hybrid, maybe vector => hybrid (Some(_), Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( index_scheduler, + index_uid, index, embedder, **semantic_ratio, diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index a9d8d3053..b2922e5ff 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -17,6 +17,32 @@ use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, is_dry_run, 
SummarizedTaskView}; use crate::Opt; +/// This macro generates the routes for the settings. +/// +/// It takes a list of settings and generates a module for each setting. +/// Each module contains the `get`, `update` and `delete` routes for the setting. +/// +/// It also generates a `configure` function that configures the routes for the settings. +macro_rules! make_setting_routes { + ($({route: $route:literal, update_verb: $update_verb:ident, value_type: $type:ty, err_type: $err_ty:ty, attr: $attr:ident, camelcase_attr: $camelcase_attr:literal, analytics: $analytics:ident},)*) => { + $( + make_setting_route!($route, $update_verb, $type, $err_ty, $attr, $camelcase_attr, $analytics); + )* + + pub fn configure(cfg: &mut web::ServiceConfig) { + use crate::extractors::sequential_extractor::SeqHandler; + cfg.service( + web::resource("") + .route(web::patch().to(SeqHandler(update_all))) + .route(web::get().to(SeqHandler(get_all))) + .route(web::delete().to(SeqHandler(delete_all)))) + $(.service($attr::resources()))*; + } + + pub const ALL_SETTINGS_NAMES: &[&str] = &[$(stringify!($attr)),*]; + }; +} + #[macro_export] macro_rules! make_setting_route { ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { @@ -153,255 +179,227 @@ macro_rules! make_setting_route { }; } -make_setting_route!( - "/filterable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, - >, - filterable_attributes, - "filterableAttributes", - FilterableAttributesAnalytics -); - -make_setting_route!( - "/sortable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSortableAttributes, - >, - sortable_attributes, - "sortableAttributes", - SortableAttributesAnalytics -); - -make_setting_route!( - "/displayed-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDisplayedAttributes, - >, - displayed_attributes, - "displayedAttributes", - DisplayedAttributesAnalytics -); - -make_setting_route!( - "/typo-tolerance", - patch, - meilisearch_types::settings::TypoSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsTypoTolerance, - >, - typo_tolerance, - "typoTolerance", - TypoToleranceAnalytics -); - -make_setting_route!( - "/searchable-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSearchableAttributes, - >, - searchable_attributes, - "searchableAttributes", - SearchableAttributesAnalytics -); - -make_setting_route!( - "/stop-words", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsStopWords, - >, - stop_words, - "stopWords", - StopWordsAnalytics -); - -make_setting_route!( - "/non-separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens, - >, - non_separator_tokens, - "nonSeparatorTokens", - NonSeparatorTokensAnalytics -); - -make_setting_route!( - "/separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens, - >, - 
separator_tokens, - "separatorTokens", - SeparatorTokensAnalytics -); - -make_setting_route!( - "/dictionary", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDictionary, - >, - dictionary, - "dictionary", - DictionaryAnalytics -); - -make_setting_route!( - "/synonyms", - put, - std::collections::BTreeMap>, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSynonyms, - >, - synonyms, - "synonyms", - SynonymsAnalytics -); - -make_setting_route!( - "/distinct-attribute", - put, - String, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDistinctAttribute, - >, - distinct_attribute, - "distinctAttribute", - DistinctAttributeAnalytics -); - -make_setting_route!( - "/proximity-precision", - put, - meilisearch_types::settings::ProximityPrecisionView, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, - >, - proximity_precision, - "proximityPrecision", - ProximityPrecisionAnalytics -); - -make_setting_route!( - "/localized-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, - >, - localized_attributes, - "localizedAttributes", - LocalesAnalytics -); - -make_setting_route!( - "/ranking-rules", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsRankingRules, - >, - ranking_rules, - "rankingRules", - RankingRulesAnalytics -); - -make_setting_route!( - "/faceting", - patch, - meilisearch_types::settings::FacetingSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsFaceting, - >, - faceting, - "faceting", - FacetingAnalytics -); - -make_setting_route!( - "/pagination", - patch, - meilisearch_types::settings::PaginationSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsPagination, - >, - pagination, - "pagination", - PaginationAnalytics -); - -make_setting_route!( - "/embedders", - patch, - std::collections::BTreeMap>, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders, - >, - embedders, - "embedders", - EmbeddersAnalytics -); - -make_setting_route!( - "/search-cutoff-ms", - put, - u64, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSearchCutoffMs, - >, - search_cutoff_ms, - "searchCutoffMs", - SearchCutoffMsAnalytics -); - -macro_rules! 
generate_configure { - ($($mod:ident),*) => { - pub fn configure(cfg: &mut web::ServiceConfig) { - use crate::extractors::sequential_extractor::SeqHandler; - cfg.service( - web::resource("") - .route(web::patch().to(SeqHandler(update_all))) - .route(web::get().to(SeqHandler(get_all))) - .route(web::delete().to(SeqHandler(delete_all)))) - $(.service($mod::resources()))*; - } - }; -} - -generate_configure!( - filterable_attributes, - sortable_attributes, - displayed_attributes, - localized_attributes, - searchable_attributes, - distinct_attribute, - proximity_precision, - stop_words, - separator_tokens, - non_separator_tokens, - dictionary, - synonyms, - ranking_rules, - typo_tolerance, - pagination, - faceting, - embedders, - search_cutoff_ms +make_setting_routes!( + { + route: "/filterable-attributes", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, + >, + attr: filterable_attributes, + camelcase_attr: "filterableAttributes", + analytics: FilterableAttributesAnalytics + }, + { + route: "/sortable-attributes", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSortableAttributes, + >, + attr: sortable_attributes, + camelcase_attr: "sortableAttributes", + analytics: SortableAttributesAnalytics + }, + { + route: "/displayed-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDisplayedAttributes, + >, + attr: displayed_attributes, + camelcase_attr: "displayedAttributes", + analytics: DisplayedAttributesAnalytics + }, + { + route: "/typo-tolerance", + update_verb: patch, + value_type: meilisearch_types::settings::TypoSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsTypoTolerance, + >, + attr: typo_tolerance, + camelcase_attr: "typoTolerance", + analytics: TypoToleranceAnalytics + }, + { + route: "/searchable-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSearchableAttributes, + >, + attr: searchable_attributes, + camelcase_attr: "searchableAttributes", + analytics: SearchableAttributesAnalytics + }, + { + route: "/stop-words", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsStopWords, + >, + attr: stop_words, + camelcase_attr: "stopWords", + analytics: StopWordsAnalytics + }, + { + route: "/non-separator-tokens", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens, + >, + attr: non_separator_tokens, + camelcase_attr: "nonSeparatorTokens", + analytics: NonSeparatorTokensAnalytics + }, + { + route: "/separator-tokens", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens, + >, + attr: separator_tokens, + camelcase_attr: "separatorTokens", + analytics: SeparatorTokensAnalytics + }, + { + route: "/dictionary", + update_verb: put, + 
value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDictionary, + >, + attr: dictionary, + camelcase_attr: "dictionary", + analytics: DictionaryAnalytics + }, + { + route: "/synonyms", + update_verb: put, + value_type: std::collections::BTreeMap>, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSynonyms, + >, + attr: synonyms, + camelcase_attr: "synonyms", + analytics: SynonymsAnalytics + }, + { + route: "/distinct-attribute", + update_verb: put, + value_type: String, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDistinctAttribute, + >, + attr: distinct_attribute, + camelcase_attr: "distinctAttribute", + analytics: DistinctAttributeAnalytics + }, + { + route: "/proximity-precision", + update_verb: put, + value_type: meilisearch_types::settings::ProximityPrecisionView, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, + >, + attr: proximity_precision, + camelcase_attr: "proximityPrecision", + analytics: ProximityPrecisionAnalytics + }, + { + route: "/localized-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, + >, + attr: localized_attributes, + camelcase_attr: "localizedAttributes", + analytics: LocalesAnalytics + }, + { + route: "/ranking-rules", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsRankingRules, + >, + attr: ranking_rules, + camelcase_attr: "rankingRules", + analytics: RankingRulesAnalytics + }, + { + route: "/faceting", + update_verb: patch, + value_type: meilisearch_types::settings::FacetingSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFaceting, + >, + attr: faceting, + camelcase_attr: "faceting", + analytics: FacetingAnalytics + }, + { + route: "/pagination", + update_verb: patch, + value_type: meilisearch_types::settings::PaginationSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsPagination, + >, + attr: pagination, + camelcase_attr: "pagination", + analytics: PaginationAnalytics + }, + { + route: "/embedders", + update_verb: patch, + value_type: std::collections::BTreeMap>, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders, + >, + attr: embedders, + camelcase_attr: "embedders", + analytics: EmbeddersAnalytics + }, + { + route: "/search-cutoff-ms", + update_verb: put, + value_type: u64, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSearchCutoffMs, + >, + attr: search_cutoff_ms, + camelcase_attr: "searchCutoffMs", + analytics: SearchCutoffMsAnalytics + }, + { + route: "/facet-search", + update_verb: put, + value_type: bool, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFacetSearch, + >, + attr: facet_search, + camelcase_attr: "facetSearch", + analytics: FacetSearchAnalytics + }, + { + route: "/prefix-search", + update_verb: put, + value_type: 
meilisearch_types::settings::PrefixSearchSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsPrefixSearch, + >, + attr: prefix_search, + camelcase_attr: "prefixSearch", + analytics: PrefixSearchAnalytics + }, ); pub async fn update_all( @@ -456,6 +454,8 @@ pub async fn update_all( non_separator_tokens: NonSeparatorTokensAnalytics::new( new_settings.non_separator_tokens.as_ref().set(), ), + facet_search: FacetSearchAnalytics::new(new_settings.facet_search.as_ref().set()), + prefix_search: PrefixSearchAnalytics::new(new_settings.prefix_search.as_ref().set()), }, &req, ); diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs index 32bddcbdd..ddca2c00a 100644 --- a/crates/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -10,7 +10,8 @@ use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; use meilisearch_types::settings::{ - FacetingSettings, PaginationSettings, ProximityPrecisionView, RankingRuleView, TypoSettings, + FacetingSettings, PaginationSettings, PrefixSearchSettings, ProximityPrecisionView, + RankingRuleView, TypoSettings, }; use serde::Serialize; @@ -36,6 +37,8 @@ pub struct SettingsAnalytics { pub dictionary: DictionaryAnalytics, pub separator_tokens: SeparatorTokensAnalytics, pub non_separator_tokens: NonSeparatorTokensAnalytics, + pub facet_search: FacetSearchAnalytics, + pub prefix_search: PrefixSearchAnalytics, } impl Aggregate for SettingsAnalytics { @@ -183,6 +186,14 @@ impl Aggregate for SettingsAnalytics { non_separator_tokens: NonSeparatorTokensAnalytics { total: new.non_separator_tokens.total.or(self.non_separator_tokens.total), }, + facet_search: FacetSearchAnalytics { + set: new.facet_search.set | self.facet_search.set, + value: new.facet_search.value.or(self.facet_search.value), + }, + prefix_search: PrefixSearchAnalytics { + set: new.prefix_search.set | self.prefix_search.set, + value: new.prefix_search.value.or(self.prefix_search.value), + }, }) } @@ -620,3 +631,35 @@ impl NonSeparatorTokensAnalytics { SettingsAnalytics { non_separator_tokens: self, ..Default::default() } } } + +#[derive(Serialize, Default)] +pub struct FacetSearchAnalytics { + pub set: bool, + pub value: Option, +} + +impl FacetSearchAnalytics { + pub fn new(settings: Option<&bool>) -> Self { + Self { set: settings.is_some(), value: settings.copied() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { facet_search: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct PrefixSearchAnalytics { + pub set: bool, + pub value: Option, +} + +impl PrefixSearchAnalytics { + pub fn new(settings: Option<&PrefixSearchSettings>) -> Self { + Self { set: settings.is_some(), value: settings.cloned() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { prefix_search: self, ..Default::default() } + } +} diff --git a/crates/meilisearch/src/routes/indexes/similar.rs b/crates/meilisearch/src/routes/indexes/similar.rs index 79f42f0aa..f47771061 100644 --- a/crates/meilisearch/src/routes/indexes/similar.rs +++ b/crates/meilisearch/src/routes/indexes/similar.rs @@ -103,8 +103,13 @@ async fn similar( let index = index_scheduler.index(&index_uid)?; - let (embedder_name, embedder, quantized) = 
- SearchKind::embedder(&index_scheduler, &index, &query.embedder, None)?; + let (embedder_name, embedder, quantized) = SearchKind::embedder( + &index_scheduler, + index_uid.to_string(), + &index, + &query.embedder, + None, + )?; tokio::task::spawn_blocking(move || { perform_similar( diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index f8b1bc6ee..a2db0b22b 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -125,14 +125,28 @@ pub async fn multi_search_with_post( }) .with_index(query_index)?; - let search_kind = - search_kind(&query, index_scheduler.get_ref(), &index, features) - .with_index(query_index)?; + let index_uid_str = index_uid.to_string(); + + let search_kind = search_kind( + &query, + index_scheduler.get_ref(), + index_uid_str.clone(), + &index, + features, + ) + .with_index(query_index)?; let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features) .with_index(query_index)?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vector, features) + perform_search( + index_uid_str.clone(), + &index, + query, + search_kind, + retrieve_vector, + features, + ) }) .await .with_index(query_index)?; diff --git a/crates/meilisearch/src/search/federated.rs b/crates/meilisearch/src/search/federated.rs index 5279c26bb..c1c6bb7d7 100644 --- a/crates/meilisearch/src/search/federated.rs +++ b/crates/meilisearch/src/search/federated.rs @@ -560,7 +560,8 @@ pub fn perform_federated_search( // use an immediately invoked lambda to capture the result without returning from the function let res: Result<(), ResponseError> = (|| { - let search_kind = search_kind(&query, index_scheduler, &index, features)?; + let search_kind = + search_kind(&query, index_scheduler, index_uid.to_string(), &index, features)?; let canonicalization_kind = match (&search_kind, &query.q) { (SearchKind::SemanticOnly { .. 
}, _) => { @@ -636,7 +637,8 @@ pub fn perform_federated_search( search.offset(0); search.limit(required_hit_count); - let (result, _semantic_hit_count) = super::search_from_kind(search_kind, search)?; + let (result, _semantic_hit_count) = + super::search_from_kind(index_uid.to_string(), search_kind, search)?; let format = AttributesFormat { attributes_to_retrieve: query.attributes_to_retrieve, retrieve_vectors, @@ -670,7 +672,10 @@ pub fn perform_federated_search( let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); - let hit_maker = HitMaker::new(&index, &rtxn, format, formatter_builder)?; + let hit_maker = + HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| { + MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())) + })?; results_by_query.push(SearchResultByQuery { federation_options, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 2e8342612..00285c4ef 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -19,7 +19,9 @@ use meilisearch_types::locales::Locale; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; -use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; +use meilisearch_types::milli::{ + FacetValueHit, InternalError, OrderBy, SearchForFacetValues, TimeBudget, +}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::{milli, Document}; use milli::tokenizer::{Language, TokenizerBuilder}; @@ -283,35 +285,38 @@ pub enum SearchKind { impl SearchKind { pub(crate) fn semantic( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, vector_len: Option, ) -> Result { let (embedder_name, embedder, quantized) = - Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + Self::embedder(index_scheduler, index_uid, index, embedder_name, vector_len)?; Ok(Self::SemanticOnly { embedder_name, embedder, quantized }) } pub(crate) fn hybrid( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, semantic_ratio: f32, vector_len: Option, ) -> Result { let (embedder_name, embedder, quantized) = - Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + Self::embedder(index_scheduler, index_uid, index, embedder_name, vector_len)?; Ok(Self::Hybrid { embedder_name, embedder, quantized, semantic_ratio }) } pub(crate) fn embedder( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, vector_len: Option, ) -> Result<(String, Arc, bool), ResponseError> { let embedder_configs = index.embedding_configs(&index.read_txn()?)?; - let embedders = index_scheduler.embedders(embedder_configs)?; + let embedders = index_scheduler.embedders(index_uid, embedder_configs)?; let (embedder, _, quantized) = embedders .get(embedder_name) @@ -892,6 +897,7 @@ fn prepare_search<'t>( } pub fn perform_search( + index_uid: String, index: &Index, query: SearchQuery, search_kind: SearchKind, @@ -918,7 +924,7 @@ pub fn perform_search( used_negative_operator, }, semantic_hit_count, - ) = search_from_kind(search_kind, search)?; + ) = search_from_kind(index_uid, search_kind, search)?; let SearchQuery { q, @@ -1071,17 +1077,27 @@ fn compute_facet_distribution_stats>( } pub fn search_from_kind( + 
index_uid: String, search_kind: SearchKind, search: milli::Search<'_>, ) -> Result<(milli::SearchResult, Option), MeilisearchHttpError> { let (milli_result, semantic_hit_count) = match &search_kind { - SearchKind::KeywordOnly => (search.execute()?, None), + SearchKind::KeywordOnly => { + let results = search + .execute() + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; + (results, None) + } SearchKind::SemanticOnly { .. } => { - let results = search.execute()?; + let results = search + .execute() + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; let semantic_hit_count = results.document_scores.len() as u32; (results, Some(semantic_hit_count)) } - SearchKind::Hybrid { semantic_ratio, .. } => search.execute_hybrid(*semantic_ratio)?, + SearchKind::Hybrid { semantic_ratio, .. } => search + .execute_hybrid(*semantic_ratio) + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid)))?, }; Ok((milli_result, semantic_hit_count)) } @@ -1183,7 +1199,7 @@ impl<'a> HitMaker<'a> { rtxn: &'a RoTxn<'a>, format: AttributesFormat, mut formatter_builder: MatcherBuilder<'a>, - ) -> Result { + ) -> milli::Result { formatter_builder.crop_marker(format.crop_marker); formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); @@ -1278,11 +1294,7 @@ impl<'a> HitMaker<'a> { }) } - pub fn make_hit( - &self, - id: u32, - score: &[ScoreDetails], - ) -> Result { + pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result { let (_, obkv) = self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?; @@ -1325,7 +1337,10 @@ impl<'a> HitMaker<'a> { .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; - vectors.insert(name, serde_json::to_value(embeddings)?); + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, + ); } document.insert("_vectors".into(), vectors.into()); } @@ -1371,7 +1386,7 @@ fn make_hits<'a>( format: AttributesFormat, matching_words: milli::MatchingWords, documents_ids_scores: impl Iterator)> + 'a, -) -> Result, MeilisearchHttpError> { +) -> milli::Result> { let mut documents = Vec::new(); let dictionary = index.dictionary(rtxn)?; @@ -1409,6 +1424,13 @@ pub fn perform_facet_search( None => TimeBudget::default(), }; + if !index.facet_search(&rtxn)? { + return Err(ResponseError::from_msg( + "The facet search is disabled for this index".to_string(), + Code::FacetSearchDisabled, + )); + } + // In the faceted search context, we want to use the intersection between the locales provided by the user // and the locales of the facet string. // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale. 
@@ -1692,12 +1714,12 @@ fn make_document( displayed_attributes: &BTreeSet, field_ids_map: &FieldsIdsMap, obkv: &obkv::KvReaderU16, -) -> Result { +) -> milli::Result { let mut document = serde_json::Map::new(); // recreate the original json for (key, value) in obkv.iter() { - let value = serde_json::from_slice(value)?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; let key = field_ids_map.name(key).expect("Missing field name").to_string(); document.insert(key, value); @@ -1722,7 +1744,7 @@ fn format_fields( displayable_ids: &BTreeSet, locales: Option<&[Language]>, localized_attributes: &[LocalizedAttributesRule], -) -> Result<(Option, Document), MeilisearchHttpError> { +) -> milli::Result<(Option, Document)> { let mut matches_position = compute_matches.then(BTreeMap::new); let mut document = document.clone(); @@ -1900,5 +1922,5 @@ fn parse_filter_array(arr: &[Value]) -> Result, MeilisearchHttpEr } } - Ok(Filter::from_array(ands)?) + Filter::from_array(ands).map_err(|e| MeilisearchHttpError::from_milli(e, None)) } diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index a7980a507..49b83360b 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -223,7 +223,7 @@ async fn list_batches_status_and_type_filtered() { } #[actix_rt::test] -async fn get_batch_filter_error() { +async fn list_batch_filter_error() { let server = Server::new().await; let (response, code) = server.batches_filter("lol=pied").await; @@ -283,6 +283,7 @@ async fn test_summarized_document_addition_or_update() { @r#" { "uid": 0, + "progress": null, "details": { "receivedDocuments": 1, "indexedDocuments": 1 @@ -313,6 +314,7 @@ async fn test_summarized_document_addition_or_update() { @r#" { "uid": 1, + "progress": null, "details": { "receivedDocuments": 1, "indexedDocuments": 1 @@ -348,6 +350,7 @@ async fn test_summarized_delete_documents_by_batch() { @r#" { "uid": 0, + "progress": null, "details": { "providedIds": 3, "deletedDocuments": 0 @@ -379,6 +382,7 @@ async fn test_summarized_delete_documents_by_batch() { @r#" { "uid": 2, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -415,6 +419,7 @@ async fn test_summarized_delete_documents_by_filter() { @r#" { "uid": 0, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -447,6 +452,7 @@ async fn test_summarized_delete_documents_by_filter() { @r#" { "uid": 2, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -479,6 +485,7 @@ async fn test_summarized_delete_documents_by_filter() { @r#" { "uid": 4, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -515,6 +522,7 @@ async fn test_summarized_delete_document_by_id() { @r#" { "uid": 0, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -546,6 +554,7 @@ async fn test_summarized_delete_document_by_id() { @r#" { "uid": 2, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -593,6 +602,7 @@ async fn test_summarized_settings_update() { @r#" { "uid": 0, + "progress": null, "details": { "displayedAttributes": [ "doggos", @@ -637,6 +647,7 @@ async fn test_summarized_index_creation() { @r#" { "uid": 0, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -664,6 +675,7 @@ async fn test_summarized_index_creation() { @r#" { "uid": 1, + "progress": null, "details": { "primaryKey": "doggos" }, @@ -808,6 +820,7 @@ async fn test_summarized_index_update() { @r#" { 
"uid": 0, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -835,6 +848,7 @@ async fn test_summarized_index_update() { @r#" { "uid": 1, + "progress": null, "details": { "primaryKey": "bones" }, @@ -867,6 +881,7 @@ async fn test_summarized_index_update() { @r#" { "uid": 3, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -894,6 +909,7 @@ async fn test_summarized_index_update() { @r#" { "uid": 4, + "progress": null, "details": { "primaryKey": "bones" }, @@ -931,6 +947,7 @@ async fn test_summarized_index_swap() { @r#" { "uid": 0, + "progress": null, "details": { "swaps": [ { @@ -971,6 +988,7 @@ async fn test_summarized_index_swap() { @r#" { "uid": 3, + "progress": null, "details": { "swaps": [ { @@ -1013,6 +1031,7 @@ async fn test_summarized_batch_cancelation() { @r#" { "uid": 1, + "progress": null, "details": { "matchedTasks": 1, "canceledTasks": 0, @@ -1050,6 +1069,7 @@ async fn test_summarized_batch_deletion() { @r#" { "uid": 1, + "progress": null, "details": { "matchedTasks": 1, "deletedTasks": 1, @@ -1083,6 +1103,7 @@ async fn test_summarized_dump_creation() { @r#" { "uid": 0, + "progress": null, "details": { "dumpUid": "[dumpUid]" }, diff --git a/crates/meilisearch/tests/common/mod.rs b/crates/meilisearch/tests/common/mod.rs index 3aae2fe80..44385752e 100644 --- a/crates/meilisearch/tests/common/mod.rs +++ b/crates/meilisearch/tests/common/mod.rs @@ -52,6 +52,25 @@ impl Value { } self } + + /// Return `true` if the `status` field is set to `failed`. + /// Panic if the `status` field doesn't exists. + #[track_caller] + pub fn is_fail(&self) -> bool { + if !self["status"].is_string() { + panic!("Called `is_fail` on {}", serde_json::to_string_pretty(&self.0).unwrap()); + } + self["status"] == serde_json::Value::String(String::from("failed")) + } + + // Panic if the json doesn't contain the `status` field set to "succeeded" + #[track_caller] + pub fn failed(&self) -> &Self { + if !self.is_fail() { + panic!("Called failed on {}", serde_json::to_string_pretty(&self.0).unwrap()); + } + self + } } impl From for Value { diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index eebc5dc63..d72b1a7a8 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1264,15 +1264,18 @@ async fn error_add_documents_bad_document_id() { let server = Server::new().await; let index = server.index("test"); index.create(Some("docid")).await; + + // unsupported characters + let documents = json!([ { "docid": "foo & bar", "content": "foobar" } ]); - index.add_documents(documents, None).await; - index.wait_task(1).await; - let (response, code) = index.get_task(1).await; + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await; + let (response, code) = index.get_task(value.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1288,7 +1291,81 @@ async fn error_add_documents_bad_document_id() { "indexedDocuments": 0 }, "error": { - "message": "Document identifier `\"foo & bar\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Document identifier `\"foo & bar\"` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_id" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // More than 512 bytes + let documents = json!([ + { + "docid": "a".repeat(600), + "content": "foobar" + } + ]); + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await; + let (response, code) = index.get_task(value.uid()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 2, + "batchUid": 2, + "indexUid": "test", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_id" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Exactly 512 bytes + let documents = json!([ + { + "docid": "a".repeat(512), + "content": "foobar" + } + ]); + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await; + let (response, code) = index.get_task(value.uid()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 3, + "batchUid": 3, + "indexUid": "test", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_document_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_id" @@ -1681,7 +1758,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", + "message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1719,7 +1796,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1757,7 +1834,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1795,7 +1872,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1833,7 +1910,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1871,7 +1948,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. 
Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1909,7 +1986,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1947,7 +2024,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1985,7 +2062,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2023,7 +2100,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2061,7 +2138,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2099,7 +2176,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", + "message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2138,7 +2215,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse longitude in the document with the id: `\"12\"`. 
Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2175,7 +2252,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2212,7 +2289,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2279,7 +2356,7 @@ async fn add_invalid_geo_and_then_settings() { ] }, "error": { - "message": "Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" diff --git a/crates/meilisearch/tests/documents/errors.rs b/crates/meilisearch/tests/documents/errors.rs index c90b9ed49..1e361fefb 100644 --- a/crates/meilisearch/tests/documents/errors.rs +++ b/crates/meilisearch/tests/documents/errors.rs @@ -604,7 +604,7 @@ async fn delete_document_by_filter() { "originalFilter": "\"doggo = bernese\"" }, "error": { - "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", + "message": "Index `EMPTY_INDEX`: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" @@ -636,7 +636,7 @@ async fn delete_document_by_filter() { "originalFilter": "\"catto = jorts\"" }, "error": { - "message": "Attribute `catto` is not filterable. Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts", + "message": "Index `SHARED_DOCUMENTS`: Attribute `catto` is not filterable. 
Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" diff --git a/crates/meilisearch/tests/documents/update_documents.rs b/crates/meilisearch/tests/documents/update_documents.rs index c0703e81b..aaf529ce5 100644 --- a/crates/meilisearch/tests/documents/update_documents.rs +++ b/crates/meilisearch/tests/documents/update_documents.rs @@ -172,7 +172,7 @@ async fn error_update_documents_bad_document_id() { assert_eq!( response["error"]["message"], json!( - r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes."# + r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes."# ) ); assert_eq!(response["error"]["code"], json!("invalid_document_id")); diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index c7d157b00..dbbd1abf0 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -79,7 +79,9 @@ async fn import_dump_v1_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -242,7 +244,9 @@ async fn import_dump_v1_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -391,7 +395,9 @@ async fn import_dump_v1_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -526,7 +532,9 @@ async fn import_dump_v2_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -673,7 +681,9 @@ async fn import_dump_v2_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -819,7 +829,9 @@ async fn import_dump_v2_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -954,7 +966,9 @@ async fn import_dump_v3_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1101,7 +1115,9 @@ async fn import_dump_v3_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1247,7 +1263,9 @@ async fn import_dump_v3_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1382,7 +1400,9 @@ 
async fn import_dump_v4_movie_raw() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1529,7 +1549,9 @@ async fn import_dump_v4_movie_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1675,7 +1697,9 @@ async fn import_dump_v4_rubygems_with_settings() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1922,7 +1946,9 @@ async fn import_dump_v6_containing_experimental_features() { "maxTotalHits": 1000 }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); @@ -2102,7 +2128,9 @@ async fn generate_and_import_dump_containing_vectors() { } }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); diff --git a/crates/meilisearch/tests/index/update_index.rs b/crates/meilisearch/tests/index/update_index.rs index 36ec27306..f991c3580 100644 --- a/crates/meilisearch/tests/index/update_index.rs +++ b/crates/meilisearch/tests/index/update_index.rs @@ -95,7 +95,7 @@ async fn error_update_existing_primary_key() { let response = index.wait_task(2).await; let expected_response = json!({ - "message": "Index already has a primary key: `id`.", + "message": "Index `test`: Index already has a primary key: `id`.", "code": "index_primary_key_already_exists", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#index_primary_key_already_exists" diff --git a/crates/meilisearch/tests/search/errors.rs b/crates/meilisearch/tests/search/errors.rs index 6840f8fba..ab50e2aa1 100644 --- a/crates/meilisearch/tests/search/errors.rs +++ b/crates/meilisearch/tests/search/errors.rs @@ -711,7 +711,7 @@ async fn filter_invalid_attribute_array() { index.wait_task(task.uid()).await; let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", + "message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid), "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -733,7 +733,7 @@ async fn filter_invalid_attribute_string() { index.wait_task(task.uid()).await; let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", + "message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid), "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -940,7 +940,7 @@ async fn sort_unsortable_attribute() { index.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ - "message": "Attribute `title` is not sortable. Available sortable attributes are: `id`.", + "message": format!("Index `{}`: Attribute `title` is not sortable. 
Available sortable attributes are: `id`.", index.uid), "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -998,7 +998,7 @@ async fn sort_unset_ranking_rule() { index.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ - "message": "You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", + "message": format!("Index `{}`: You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", index.uid), "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1024,19 +1024,18 @@ async fn search_on_unknown_field() { index.update_settings_searchable_attributes(json!(["id", "title"])).await; index.wait_task(response.uid()).await.succeeded(); + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", index.uid), + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + }); index .search( json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}), |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); }, ) .await; @@ -1050,19 +1049,18 @@ async fn search_on_unknown_field_plus_joker() { index.update_settings_searchable_attributes(json!(["id", "title"])).await; index.wait_task(response.uid()).await.succeeded(); + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", index.uid), + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + }); index .search( json!({"q": "Captain Marvel", "attributesToSearchOn": ["*", "unknown"]}), |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); }, ) .await; @@ -1071,15 +1069,8 @@ async fn search_on_unknown_field_plus_joker() { .search( json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown", "*"]}), |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. 
Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); }, ) .await; @@ -1092,47 +1083,44 @@ async fn distinct_at_search_time() { let (task, _) = index.create(None).await; index.wait_task(task.uid()).await.succeeded(); + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", index.uid), + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - snapshot!(code, @"400 Bad Request"); - snapshot!(response, @r###" - { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await; index.wait_task(task.uid()).await; + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", index.uid), + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - snapshot!(code, @"400 Bad Request"); - snapshot!(response, @r###" - { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await; index.wait_task(task.uid()).await; + let expected_response = json!({ + "message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.", index.uid), + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + }); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; - snapshot!(code, @"400 Bad Request"); - snapshot!(response, @r###" - { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. 
Available filterable attributes are: `color, <..hidden-attributes>`.", - "code": "invalid_search_distinct", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" - } - "###); + assert_eq!(response, expected_response); + assert_eq!(code, 400); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await; diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 12d2226a9..696c23f91 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -41,8 +41,8 @@ async fn simple_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -57,6 +57,116 @@ async fn simple_facet_search() { assert_eq!(response["facetHits"].as_array().unwrap().len(), 1); } +#[actix_rt::test] +async fn simple_facet_search_on_movies() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = json!([ + { + "id": 1, + "title": "Carol", + "genres": [ + "Romance", + "Drama" + ], + "color": [ + "red" + ], + "platforms": [ + "MacOS", + "Linux", + "Windows" + ] + }, + { + "id": 2, + "title": "Wonder Woman", + "genres": [ + "Action", + "Adventure" + ], + "color": [ + "green" + ], + "platforms": [ + "MacOS" + ] + }, + { + "id": 3, + "title": "Life of Pi", + "genres": [ + "Adventure", + "Drama" + ], + "color": [ + "blue" + ], + "platforms": [ + "Windows" + ] + }, + { + "id": 4, + "title": "Mad Max: Fury Road", + "genres": [ + "Adventure", + "Science Fiction" + ], + "color": [ + "red" + ], + "platforms": [ + "MacOS", + "Linux" + ] + }, + { + "id": 5, + "title": "Moana", + "genres": [ + "Fantasy", + "Action" + ], + "color": [ + "red" + ], + "platforms": [ + "Windows" + ] + }, + { + "id": 6, + "title": "Philadelphia", + "genres": [ + "Drama" + ], + "color": [ + "blue" + ], + "platforms": [ + "MacOS", + "Linux", + "Windows" + ] + } + ]); + let (response, code) = + index.update_settings_filterable_attributes(json!(["genres", "color"])).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetQuery": "", "facetName": "genres", "q": "" })).await; + + assert_eq!(code, 200, "{}", response); + snapshot!(response["facetHits"], @r###"[{"value":"Action","count":2},{"value":"Adventure","count":3},{"value":"Drama","count":3},{"value":"Fantasy","count":1},{"value":"Romance","count":1},{"value":"Science Fiction","count":1}]"###); +} + #[actix_rt::test] async fn advanced_facet_search() { let server = Server::new().await; @@ -65,8 +175,8 @@ async fn advanced_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "enabled": false })).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = 
index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -89,8 +199,8 @@ async fn more_advanced_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "disableOnWords": ["adventre"] })).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -113,8 +223,8 @@ async fn simple_facet_search_with_max_values() { let documents = DOCUMENTS.clone(); index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -135,8 +245,8 @@ async fn simple_facet_search_by_count_with_max_values() { ) .await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -151,8 +261,8 @@ async fn non_filterable_facet_search_error() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -170,8 +280,8 @@ async fn facet_search_dont_support_words() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "words"})).await; @@ -188,8 +298,8 @@ async fn simple_facet_search_with_sort_by_count() { let documents = DOCUMENTS.clone(); index.update_settings_faceting(json!({ "sortFacetValuesBy": { "*": "count" } })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -200,3 +310,129 @@ async fn simple_facet_search_with_sort_by_count() { assert_eq!(hits[0], json!({ "value": "Action", "count": 3 })); assert_eq!(hits[1], json!({ "value": "Adventure", "count": 2 })); } + +#[actix_rt::test] +async fn add_documents_and_deactivate_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + let (response, 
code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 400, "{}", response); + snapshot!(response, @r###" + { + "message": "The facet search is disabled for this index", + "code": "facet_search_disabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#facet_search_disabled" + } + "###); +} + +#[actix_rt::test] +async fn deactivate_facet_search_and_add_documents() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 400, "{}", response); + snapshot!(response, @r###" + { + "message": "The facet search is disabled for this index", + "code": "facet_search_disabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#facet_search_disabled" + } + "###); +} + +#[actix_rt::test] +async fn deactivate_facet_search_add_documents_and_activate_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "facetSearch": true, + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 2); +} + +#[actix_rt::test] +async fn deactivate_facet_search_add_documents_and_reset_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "facetSearch": serde_json::Value::Null, + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 2); +} diff --git a/crates/meilisearch/tests/search/formatted.rs 
b/crates/meilisearch/tests/search/formatted.rs index ee33939fd..5ded39976 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -4,6 +4,58 @@ use super::*; use crate::common::Server; use crate::json; +#[actix_rt::test] +async fn search_formatted_from_sdk() { + let server = Server::new_shared(); + let index = server.unique_index(); + + index + .update_settings( + json!({ "filterableAttributes": ["genre"], "searchableAttributes": ["title"] }), + ) + .await; + + let documents = json!([ + { "id": 123, "title": "Pride and Prejudice", "genre": "romance" }, + { "id": 456, "title": "Le Petit Prince", "genre": "adventure" }, + { "id": 1, "title": "Alice In Wonderland", "genre": "adventure" }, + { "id": 2, "title": "Le Rouge et le Noir", "genre": "romance" }, + { "id": 1344, "title": "The Hobbit", "genre": "adventure" }, + { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "genre": "fantasy" }, + { "id": 7, "title": "Harry Potter and the Chamber of Secrets", "genre": "fantasy" }, + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy" } + ]); + let (response, _) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + index + .search( + json!({ "q":"prince", + "attributesToCrop": ["title"], + "cropLength": 2, + "filter": "genre = adventure", + "attributesToHighlight": ["title"], + "attributesToRetrieve": ["title"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + allow_duplicates! { + assert_json_snapshot!(response["hits"][0], + { "._rankingScore" => "[score]" }, + @r###" + { + "title": "Le Petit Prince", + "_formatted": { + "title": "…Petit Prince" + } + } + "###); + } + }, + ) + .await; +} + #[actix_rt::test] async fn formatted_contain_wildcard() { let server = Server::new_shared(); diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index 8cafe1dd8..057b2b3a2 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -15,6 +15,7 @@ mod pagination; mod restrict_searchable; mod search_queue; +use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tempfile::TempDir; @@ -62,6 +63,71 @@ async fn simple_search() { .await; } +#[actix_rt::test] +async fn search_with_stop_word() { + // related to https://github.com/meilisearch/meilisearch/issues/4984 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index + .update_settings(json!({"stopWords": ["the", "The", "a", "an", "to", "in", "of"]})) + .await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // prefix search + index + .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @"[]"); + }) + .await; + + // non-prefix search + index + .search(json!({"q": "to the ", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Shazam!", + "_formatted": { + "title": "Shazam!" 
+ } + }, + { + "title": "Captain Marvel", + "_formatted": { + "title": "Captain Marvel" + } + }, + { + "title": "Escape Room", + "_formatted": { + "title": "Escape Room" + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "_formatted": { + "title": "How to Train Your Dragon: The Hidden World" + } + }, + { + "title": "Gläss", + "_formatted": { + "title": "Gläss" + } + } + ] + "###); + }) + .await; +} + #[actix_rt::test] async fn phrase_search_with_stop_word() { // related to https://github.com/meilisearch/meilisearch/issues/3521 diff --git a/crates/meilisearch/tests/search/multi.rs b/crates/meilisearch/tests/search/multi.rs index 8d7340f0d..9377f435a 100644 --- a/crates/meilisearch/tests/search/multi.rs +++ b/crates/meilisearch/tests/search/multi.rs @@ -1070,7 +1070,7 @@ async fn federation_one_query_error() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1102,7 +1102,7 @@ async fn federation_one_query_sort_error() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1166,7 +1166,7 @@ async fn federation_multiple_query_errors() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", + "message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1198,7 +1198,7 @@ async fn federation_multiple_query_sort_errors() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Attribute `title` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1231,7 +1231,7 @@ async fn federation_multiple_query_errors_interleaved() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not filterable. This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not filterable. 
This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1264,7 +1264,7 @@ async fn federation_multiple_query_sort_errors_interleaved() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" diff --git a/crates/meilisearch/tests/search/restrict_searchable.rs b/crates/meilisearch/tests/search/restrict_searchable.rs index ca659c518..abd13fadf 100644 --- a/crates/meilisearch/tests/search/restrict_searchable.rs +++ b/crates/meilisearch/tests/search/restrict_searchable.rs @@ -367,3 +367,50 @@ async fn search_on_exact_field() { }) .await; } + +#[actix_rt::test] +async fn phrase_search_on_title() { + let server = Server::new().await; + let documents = json!([ + { "id": 8, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 5, "desc": "Document Review", "title": "Document Review Attorney" }, + { "id": 4, "desc": "Document Review", "title": "Document Review Manager - Cyber Incident Response (Remote)" }, + { "id": 3, "desc": "Document Review", "title": "Document Review Paralegal" }, + { "id": 2, "desc": "Document Review", "title": "Document Controller (Saudi National)" }, + { "id": 1, "desc": "Document Review", "title": "Document Reviewer" }, + { "id": 7, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 6, "desc": "Document Review", "title": "Document Review (Entry Level)" } + ]); + let index = index_with_documents(&server, &documents).await; + + index + .search( + json!({"q": "\"Document Review\"", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["title"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review Attorney" + }, + { + "title": "Document Review Manager - Cyber Incident Response (Remote)" + }, + { + "title": "Document Review Paralegal" + }, + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review (Entry Level)" + } + ] + "###); + }, + ) + .await; +} diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index 6de0db0b3..55d9441ee 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -1,44 +1,185 @@ -use std::collections::HashMap; - -use once_cell::sync::Lazy; - -use crate::common::{Server, Value}; +use crate::common::Server; use crate::json; -static DEFAULT_SETTINGS_VALUES: Lazy> = Lazy::new(|| { - let mut map = HashMap::new(); - map.insert("displayed_attributes", json!(["*"])); - map.insert("searchable_attributes", json!(["*"])); - map.insert("localized_attributes", json!(null)); - map.insert("filterable_attributes", json!([])); - map.insert("distinct_attribute", json!(null)); - map.insert( - "ranking_rules", - json!(["words", "typo", "proximity", "attribute", "sort", "exactness"]), - ); - map.insert("stop_words", json!([])); - 
map.insert("non_separator_tokens", json!([])); - map.insert("separator_tokens", json!([])); - map.insert("dictionary", json!([])); - map.insert("synonyms", json!({})); - map.insert( - "faceting", - json!({ - "maxValuesPerFacet": json!(100), - "sortFacetValuesBy": { - "*": "alpha" +macro_rules! test_setting_routes { + ($({setting: $setting:ident, update_verb: $update_verb:ident, default_value: $default_value:tt},) *) => { + $( + mod $setting { + use crate::common::Server; + + #[actix_rt::test] + async fn get_unexisting_index() { + let server = Server::new().await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (_response, code) = server.service.get(url).await; + assert_eq!(code, 404); + } + + #[actix_rt::test] + async fn update_unexisting_index() { + let server = Server::new().await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (response, code) = server.service.$update_verb(url, serde_json::Value::Null.into()).await; + assert_eq!(code, 202, "{}", response); + server.index("").wait_task(0).await; + let (response, code) = server.index("test").get().await; + assert_eq!(code, 200, "{}", response); + } + + #[actix_rt::test] + async fn delete_unexisting_index() { + let server = Server::new().await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (_, code) = server.service.delete(url).await; + assert_eq!(code, 202); + let response = server.index("").wait_task(0).await; + assert_eq!(response["status"], "failed"); + } + + #[actix_rt::test] + async fn get_default() { + let server = Server::new().await; + let index = server.index("test"); + let (response, code) = index.create(None).await; + assert_eq!(code, 202, "{}", response); + index.wait_task(0).await; + let url = format!("/indexes/test/settings/{}", + stringify!($setting) + .chars() + .map(|c| if c == '_' { '-' } else { c }) + .collect::()); + let (response, code) = server.service.get(url).await; + assert_eq!(code, 200, "{}", response); + let expected = crate::json!($default_value); + assert_eq!(expected, response); + } } - }), - ); - map.insert( - "pagination", - json!({ - "maxTotalHits": json!(1000), - }), - ); - map.insert("search_cutoff_ms", json!(null)); - map -}); + )* + + #[actix_rt::test] + async fn all_setting_tested() { + let expected = std::collections::BTreeSet::from_iter(meilisearch::routes::indexes::settings::ALL_SETTINGS_NAMES.iter()); + let tested = std::collections::BTreeSet::from_iter([$(stringify!($setting)),*].iter()); + let diff: Vec<_> = expected.difference(&tested).collect(); + assert!(diff.is_empty(), "Not all settings were tested, please add the following settings to the `test_setting_routes!` macro: {:?}", diff); + } + }; +} + +test_setting_routes!( + { + setting: filterable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: displayed_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: localized_attributes, + update_verb: put, + default_value: null + }, + { + setting: searchable_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: distinct_attribute, + update_verb: put, + default_value: null + }, + { + setting: stop_words, + update_verb: put, + default_value: [] + }, + { + setting: separator_tokens, + update_verb: put, + default_value: [] + }, + { + 
setting: non_separator_tokens, + update_verb: put, + default_value: [] + }, + { + setting: dictionary, + update_verb: put, + default_value: [] + }, + { + setting: ranking_rules, + update_verb: put, + default_value: ["words", "typo", "proximity", "attribute", "sort", "exactness"] + }, + { + setting: synonyms, + update_verb: put, + default_value: {} + }, + { + setting: pagination, + update_verb: patch, + default_value: {"maxTotalHits": 1000} + }, + { + setting: faceting, + update_verb: patch, + default_value: {"maxValuesPerFacet": 100, "sortFacetValuesBy": {"*": "alpha"}} + }, + { + setting: search_cutoff_ms, + update_verb: put, + default_value: null + }, + { + setting: embedders, + update_verb: patch, + default_value: null + }, + { + setting: facet_search, + update_verb: put, + default_value: true + }, + { + setting: prefix_search, + update_verb: put, + default_value: "indexingTime" + }, + { + setting: proximity_precision, + update_verb: put, + default_value: "byWord" + }, + { + setting: sortable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: typo_tolerance, + update_verb: patch, + default_value: {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": []} + }, +); #[actix_rt::test] async fn get_settings_unexisting_index() { @@ -56,7 +197,7 @@ async fn get_settings() { let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 17); + assert_eq!(settings.keys().len(), 19); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["filterableAttributes"], json!([])); @@ -87,6 +228,8 @@ async fn get_settings() { ); assert_eq!(settings["proximityPrecision"], json!("byWord")); assert_eq!(settings["searchCutoffMs"], json!(null)); + assert_eq!(settings["prefixSearch"], json!("indexingTime")); + assert_eq!(settings["facetSearch"], json!(true)); } #[actix_rt::test] @@ -199,7 +342,9 @@ async fn secrets_are_hidden_in_settings() { } }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); @@ -338,93 +483,6 @@ async fn error_update_setting_unexisting_index_invalid_uid() { "###); } -macro_rules! 
test_setting_routes { - ($($setting:ident $write_method:ident), *) => { - $( - mod $setting { - use crate::common::Server; - use super::DEFAULT_SETTINGS_VALUES; - - #[actix_rt::test] - async fn get_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (_response, code) = server.service.get(url).await; - assert_eq!(code, 404); - } - - #[actix_rt::test] - async fn update_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (response, code) = server.service.$write_method(url, serde_json::Value::Null.into()).await; - assert_eq!(code, 202, "{}", response); - server.index("").wait_task(0).await; - let (response, code) = server.index("test").get().await; - assert_eq!(code, 200, "{}", response); - } - - #[actix_rt::test] - async fn delete_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (_, code) = server.service.delete(url).await; - assert_eq!(code, 202); - let response = server.index("").wait_task(0).await; - assert_eq!(response["status"], "failed"); - } - - #[actix_rt::test] - async fn get_default() { - let server = Server::new().await; - let index = server.index("test"); - let (response, code) = index.create(None).await; - assert_eq!(code, 202, "{}", response); - index.wait_task(0).await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (response, code) = server.service.get(url).await; - assert_eq!(code, 200, "{}", response); - let expected = DEFAULT_SETTINGS_VALUES.get(stringify!($setting)).unwrap(); - assert_eq!(expected, &response); - } - } - )* - }; -} - -test_setting_routes!( - filterable_attributes put, - displayed_attributes put, - localized_attributes put, - searchable_attributes put, - distinct_attribute put, - stop_words put, - separator_tokens put, - non_separator_tokens put, - dictionary put, - ranking_rules put, - synonyms put, - pagination patch, - faceting patch, - search_cutoff_ms put -); - #[actix_rt::test] async fn error_set_invalid_ranking_rules() { let server = Server::new().await; diff --git a/crates/meilisearch/tests/settings/mod.rs b/crates/meilisearch/tests/settings/mod.rs index ccb4139e6..67df4068a 100644 --- a/crates/meilisearch/tests/settings/mod.rs +++ b/crates/meilisearch/tests/settings/mod.rs @@ -1,5 +1,6 @@ mod distinct; mod errors; mod get_settings; +mod prefix_search_settings; mod proximity_settings; mod tokenizer_customization; diff --git a/crates/meilisearch/tests/settings/prefix_search_settings.rs b/crates/meilisearch/tests/settings/prefix_search_settings.rs new file mode 100644 index 000000000..5da758a7d --- /dev/null +++ b/crates/meilisearch/tests/settings/prefix_search_settings.rs @@ -0,0 +1,458 @@ +use meili_snap::{json_string, snapshot}; +use once_cell::sync::Lazy; + +use crate::common::Server; +use crate::json; + +static DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + }, + ]) +}); + 
+#[actix_rt::test] +async fn add_docs_and_disable() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + // only 1 document should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; + + // only 1 document should match + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_and_add_docs() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + // only 1 document should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_add_docs_and_enable() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "indexingTime", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + 
snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_add_docs_and_reset() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": serde_json::Value::Null, + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn default_behavior() { + let server = Server::new().await; + let index = 
server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} diff --git a/crates/meilisearch/tests/similar/errors.rs b/crates/meilisearch/tests/similar/errors.rs index 1e933e1c0..86fca97ad 100644 --- a/crates/meilisearch/tests/similar/errors.rs +++ b/crates/meilisearch/tests/similar/errors.rs @@ -79,7 +79,7 @@ async fn similar_bad_id() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" @@ -172,7 +172,7 @@ async fn similar_invalid_id() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Invalid value at `.id`: the value of `id` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" diff --git a/crates/meilisearch/tests/snapshot/mod.rs b/crates/meilisearch/tests/snapshot/mod.rs index 976551190..0d569fc7c 100644 --- a/crates/meilisearch/tests/snapshot/mod.rs +++ b/crates/meilisearch/tests/snapshot/mod.rs @@ -129,11 +129,11 @@ async fn perform_on_demand_snapshot() { index.load_test_set().await; - server.index("doggo").create(Some("bone")).await; - index.wait_task(2).await; + let (task, _) = server.index("doggo").create(Some("bone")).await; + index.wait_task(task.uid()).await.succeeded(); - server.index("doggo").create(Some("bone")).await; - index.wait_task(2).await; + let (task, _) = server.index("doggo").create(Some("bone")).await; + index.wait_task(task.uid()).await.failed(); let (task, code) = server.create_snapshot().await; snapshot!(code, @"202 Accepted"); diff --git a/crates/meilisearch/tests/tasks/mod.rs b/crates/meilisearch/tests/tasks/mod.rs index fc05ee4ca..c9d3f31ed 100644 --- a/crates/meilisearch/tests/tasks/mod.rs +++ b/crates/meilisearch/tests/tasks/mod.rs @@ -448,7 +448,7 @@ async fn test_summarized_delete_documents_by_filter() { "originalFilter": "\"doggo = bernese\"" }, "error": { - "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", + "message": "Index `test`: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" diff --git a/crates/meilisearch/tests/vector/binary_quantized.rs b/crates/meilisearch/tests/vector/binary_quantized.rs index 560c4e2f2..790df5459 100644 --- a/crates/meilisearch/tests/vector/binary_quantized.rs +++ b/crates/meilisearch/tests/vector/binary_quantized.rs @@ -318,7 +318,7 @@ async fn try_to_disable_binary_quantization() { } }, "error": { - "message": "`.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.", + "message": "Index `doggo`: `.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.", "code": "invalid_settings_embedders", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index bb20d7b2a..adad9fa81 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -250,7 +250,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. 
Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -280,7 +280,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -311,7 +311,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -340,7 +340,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -369,7 +369,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -398,7 +398,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. 
Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -440,7 +440,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -469,7 +469,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -498,7 +498,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. 
Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -539,7 +539,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -569,7 +569,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -599,7 +599,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/crates/meilisearch/tests/vector/openai.rs b/crates/meilisearch/tests/vector/openai.rs index 99aa1f710..b02111639 100644 --- a/crates/meilisearch/tests/vector/openai.rs +++ b/crates/meilisearch/tests/vector/openai.rs @@ -713,7 +713,7 @@ async fn bad_api_key() { } }, "error": { - "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. 
You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -757,7 +757,7 @@ async fn bad_api_key() { } }, "error": { - "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. 
You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index cadc54f24..bf6876fbe 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -985,7 +985,7 @@ async fn bad_settings() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1025,7 +1025,7 @@ async fn bad_settings() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1178,7 +1178,7 @@ async fn server_returns_bad_request() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1247,7 +1247,7 @@ async fn server_returns_bad_request() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - 
server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1306,7 +1306,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1362,7 +1362,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1414,7 +1414,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected f32", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected 
f32", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1478,7 +1478,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1542,7 +1542,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1908,7 +1908,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1951,7 +1951,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check 
the `apiKey` parameter in the embedder configuration", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -2099,7 +2099,7 @@ async fn searchable_reindex() { ] }, "error": { - "message": "While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index 048da6232..7d0b9f32c 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -10,12 +10,15 @@ license.workspace = true [dependencies] anyhow = "1.0.86" +arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" } clap = { version = "4.5.9", features = ["derive"] } dump = { path = "../dump" } file-store = { path = "../file-store" } +indexmap = {version = "2.7.0", features = ["serde"]} meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } serde = { version = "1.0.209", features = ["derive"] } +serde_json = {version = "1.0.133", features = ["preserve_order"]} +tempfile = "3.14.0" time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] } uuid = { version = "1.10.0", features = ["v4"], default-features = false } -arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" } diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index f84cea98d..44eb4960e 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -73,7 +73,7 @@ enum Command { /// /// Supported upgrade paths: /// - /// - v1.9.x -> v1.10.x -> v1.11.x + /// - v1.9.x -> v1.10.x -> v1.11.x -> v1.12.x OfflineUpgrade { #[arg(long)] target_version: String, diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index 36630c3b3..14f941311 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -1,13 +1,14 @@ mod v1_10; mod v1_11; +mod v1_12; mod v1_9; use std::path::{Path, PathBuf}; use anyhow::{bail, Context}; use meilisearch_types::versioning::create_version_file; - use v1_10::v1_9_to_v1_10; +use v1_12::v1_11_to_v1_12; use crate::upgrade::v1_11::v1_10_to_v1_11; @@ -22,6 +23,7 @@ impl OfflineUpgrade { let upgrade_list = [ (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"), (v1_10_to_v1_11, "1", "11", "0"), + (v1_11_to_v1_12, "1", "12", "0"), ]; let (current_major, current_minor, current_patch) = &self.current_version; @@ -33,6 +35,7 @@ impl OfflineUpgrade { ) { ("1", "9", _) => 0, ("1", "10", _) => 1, + ("1", "11", _) => 2, _ => { bail!("Unsupported current version 
{current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10") } @@ -43,6 +46,7 @@ impl OfflineUpgrade { let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { ("1", "10", _) => 0, ("1", "11", _) => 1, + ("1", "12", _) => 2, (major, _, _) if major.starts_with('v') => { bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") } diff --git a/crates/meilitool/src/upgrade/v1_10.rs b/crates/meilitool/src/upgrade/v1_10.rs index 2efc1773c..4a49ea471 100644 --- a/crates/meilitool/src/upgrade/v1_10.rs +++ b/crates/meilitool/src/upgrade/v1_10.rs @@ -1,18 +1,13 @@ -use anyhow::bail; use std::path::Path; -use anyhow::Context; -use meilisearch_types::{ - heed::{ - types::{SerdeJson, Str}, - Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, - }, - milli::index::{db_name, main_key}, -}; - -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; +use anyhow::{bail, Context}; +use meilisearch_types::heed::types::{SerdeJson, Str}; +use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; +use meilisearch_types::milli::index::{db_name, main_key}; use super::v1_9; +use crate::uuid_codec::UuidCodec; +use crate::{try_opening_database, try_opening_poly_database}; pub type FieldDistribution = std::collections::BTreeMap; diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs index 0c84d3842..92d853dd0 100644 --- a/crates/meilitool/src/upgrade/v1_11.rs +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -7,12 +7,12 @@ use std::path::Path; use anyhow::Context; -use meilisearch_types::{ - heed::{types::Str, Database, EnvOpenOptions}, - milli::index::db_name, -}; +use meilisearch_types::heed::types::Str; +use meilisearch_types::heed::{Database, EnvOpenOptions}; +use meilisearch_types::milli::index::db_name; -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; +use crate::uuid_codec::UuidCodec; +use crate::{try_opening_database, try_opening_poly_database}; pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { println!("Upgrading from v1.10.0 to v1.11.0"); diff --git a/crates/meilitool/src/upgrade/v1_12.rs b/crates/meilitool/src/upgrade/v1_12.rs new file mode 100644 index 000000000..444617375 --- /dev/null +++ b/crates/meilitool/src/upgrade/v1_12.rs @@ -0,0 +1,79 @@ +//! The breaking changes that happened between the v1.11 and the v1.12 are: +//! - The new indexer changed the update files format from OBKV to ndjson. https://github.com/meilisearch/meilisearch/pull/4900 + +use std::io::BufWriter; +use std::path::Path; + +use anyhow::Context; +use file_store::FileStore; +use indexmap::IndexMap; +use meilisearch_types::milli::documents::DocumentsBatchReader; +use serde_json::value::RawValue; +use tempfile::NamedTempFile; + +pub fn v1_11_to_v1_12(db_path: &Path) -> anyhow::Result<()> { + println!("Upgrading from v1.11.0 to v1.12.0"); + + convert_update_files(db_path)?; + + Ok(()) +} + +/// Convert the update files from OBKV to ndjson format. +/// +/// 1) List all the update files using the file store. +/// 2) For each update file, read the update file into a DocumentsBatchReader. +/// 3) For each document in the update file, convert the document to a JSON object. +/// 4) Write the JSON object to a tmp file in the update files directory. +/// 5) Persist the tmp file replacing the old update file. 
+fn convert_update_files(db_path: &Path) -> anyhow::Result<()> { + let update_files_dir_path = db_path.join("update_files"); + let file_store = FileStore::new(&update_files_dir_path).with_context(|| { + format!("while creating file store for update files dir {update_files_dir_path:?}") + })?; + + for uuid in file_store.all_uuids().context("while retrieving uuids from file store")? { + let uuid = uuid.context("while retrieving uuid from file store")?; + let update_file_path = file_store.get_update_path(uuid); + let update_file = file_store + .get_update(uuid) + .with_context(|| format!("while getting update file for uuid {uuid:?}"))?; + + let mut file = + NamedTempFile::new_in(&update_files_dir_path).map(BufWriter::new).with_context( + || format!("while creating bufwriter for update file {update_file_path:?}"), + )?; + + let reader = DocumentsBatchReader::from_reader(update_file).with_context(|| { + format!("while creating documents batch reader for update file {update_file_path:?}") + })?; + let (mut cursor, index) = reader.into_cursor_and_fields_index(); + + while let Some(document) = cursor.next_document().with_context(|| { + format!( + "while reading documents from batch reader for update file {update_file_path:?}" + ) + })? { + let mut json_document = IndexMap::new(); + for (fid, value) in document { + let field_name = index + .name(fid) + .with_context(|| format!("while getting field name for fid {fid} for update file {update_file_path:?}"))?; + let value: &RawValue = serde_json::from_slice(value)?; + json_document.insert(field_name, value); + } + + serde_json::to_writer(&mut file, &json_document)?; + } + + let file = file.into_inner().map_err(|e| e.into_error()).context(format!( + "while flushing update file bufwriter for update file {update_file_path:?}" + ))?; + let _ = file + // atomically replace the obkv file with the rewritten NDJSON file + .persist(&update_file_path) + .with_context(|| format!("while persisting update file {update_file_path:?}"))?; + } + + Ok(()) +} diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index c47a0a354..9f113e013 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -18,8 +18,7 @@ bincode = "1.3.3" bstr = "1.9.1" bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -# charabia = { version = "0.9.0", default-features = false } -charabia = { git = "https://github.com/meilisearch/charabia", branch = "mutualize-char-normalizer", default-features = false } +charabia = { version = "0.9.2", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.13" deserr = "0.6.2" @@ -28,10 +27,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { version = "0.4.7", default-features = false, features = [ - "rayon", # TODO Should we keep this feature - "tempfile", -], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" } +grenad = { version = "0.5.0", default-features = false, features = ["rayon", "tempfile"] } heed = { version = "0.20.3", default-features = false, features = [ "serde-json", "serde-bincode", @@ -42,11 +38,11 @@ json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memchr = "2.5.0" memmap2 = "0.9.4" -obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } +obkv = "0.3.0" once_cell = "1.19.0" ordered-float = "4.2.1" rayon = "1.10.0" -roaring = { version = "0.10.6", features = 
["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } rstar = { version = "0.12.0", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] } @@ -95,13 +91,15 @@ ureq = { version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" hashbrown = "0.15.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } bumpalo = "3.16.0" +bumparaw-collections = "0.1.2" thread_local = "1.1.8" allocator-api2 = "0.2.18" rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" +bbqueue = { git = "https://github.com/meilisearch/bbqueue" } +flume = { version = "0.11.1", default-features = false } [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/crates/milli/src/documents/primary_key.rs b/crates/milli/src/documents/primary_key.rs index fb8b3d027..c1dd9a9b8 100644 --- a/crates/milli/src/documents/primary_key.rs +++ b/crates/milli/src/documents/primary_key.rs @@ -280,7 +280,7 @@ fn starts_with(selector: &str, key: &str) -> bool { pub fn validate_document_id_str(document_id: &str) -> Option<&str> { if document_id.is_empty() - || document_id.len() > 512 + || document_id.len() >= 512 || !document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') { None diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 6c60dcecc..2bd57bba5 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -3,6 +3,7 @@ use std::convert::Infallible; use std::fmt::Write; use std::{io, str}; +use bstr::BString; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use rhai::EvalAltResult; @@ -61,6 +62,10 @@ pub enum InternalError { Serialization(#[from] SerializationError), #[error(transparent)] Store(#[from] MdbError), + #[error("Cannot delete {key:?} from database {database_name}: {error}")] + StoreDeletion { database_name: &'static str, key: BString, error: heed::Error }, + #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] + StorePut { database_name: &'static str, key: BString, value_length: usize, error: heed::Error }, #[error(transparent)] Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted")] @@ -109,7 +114,7 @@ pub enum UserError { "Document identifier `{}` is invalid. 
\ A document identifier can be of type integer or string, \ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \ -and can not be more than 512 bytes.", .document_id.to_string() +and can not be more than 511 bytes.", .document_id.to_string() )] InvalidDocumentId { document_id: Value }, #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))] diff --git a/crates/milli/src/heed_codec/facet/mod.rs b/crates/milli/src/heed_codec/facet/mod.rs index a8bb5055e..c0870c9fd 100644 --- a/crates/milli/src/heed_codec/facet/mod.rs +++ b/crates/milli/src/heed_codec/facet/mod.rs @@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { fn bytes_encode(value: &'a Self::EItem) -> Result, BoxedError> { let mut v = vec![value.size]; - CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); + CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v); Ok(Cow::Owned(v)) } } diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 257d5bd0a..0ab162880 100644 --- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -27,18 +27,27 @@ impl CboRoaringBitmapCodec { } } - pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) { + pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec) { + Self::serialize_into_writer(roaring, vec).unwrap() + } + + pub fn serialize_into_writer( + roaring: &RoaringBitmap, + mut writer: W, + ) -> io::Result<()> { if roaring.len() <= THRESHOLD as u64 { // If the number of items (u32s) to encode is less than or equal to the threshold // it means that it would weigh the same or less than the RoaringBitmap // header, so we directly encode them using ByteOrder instead. for integer in roaring { - vec.write_u32::(integer).unwrap(); + writer.write_u32::(integer)?; } } else { // Otherwise, we use the classic RoaringBitmapCodec that writes a header. - roaring.serialize_into(vec).unwrap(); + roaring.serialize_into(writer)?; } + + Ok(()) } pub fn deserialize_from(mut bytes: &[u8]) -> io::Result { @@ -143,7 +152,7 @@ impl CboRoaringBitmapCodec { return Ok(None); } - Self::serialize_into(&previous, buffer); + Self::serialize_into_vec(&previous, buffer); Ok(Some(&buffer[..])) } } @@ -169,7 +178,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { fn bytes_encode(item: &Self::EItem) -> Result, BoxedError> { let mut vec = Vec::with_capacity(Self::serialized_size(item)); - Self::serialize_into(item, &mut vec); + Self::serialize_into_vec(item, &mut vec); Ok(Cow::Owned(vec)) } } diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 89f965b7c..f60b59c72 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -70,6 +70,8 @@ pub mod main_key { pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; pub const SEARCH_CUTOFF: &str = "search_cutoff"; pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules"; + pub const FACET_SEARCH: &str = "facet_search"; + pub const PREFIX_SEARCH: &str = "prefix_search"; } pub mod db_name { @@ -1233,6 +1235,10 @@ impl Index { ) } + pub(crate) fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY) + } + /// Returns the FST which is the words prefixes dictionary of the engine. 
pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn<'t>) -> Result>> { match self.main.remap_types::().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { @@ -1562,6 +1568,41 @@ impl Index { self.main.remap_key_type::().delete(txn, main_key::PROXIMITY_PRECISION) } + pub fn prefix_search(&self, txn: &RoTxn<'_>) -> heed::Result> { + self.main.remap_types::>().get(txn, main_key::PREFIX_SEARCH) + } + + pub(crate) fn put_prefix_search( + &self, + txn: &mut RwTxn<'_>, + val: PrefixSearch, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + txn, + main_key::PREFIX_SEARCH, + &val, + ) + } + + pub(crate) fn delete_prefix_search(&self, txn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::PREFIX_SEARCH) + } + + pub fn facet_search(&self, txn: &RoTxn<'_>) -> heed::Result { + self.main + .remap_types::>() + .get(txn, main_key::FACET_SEARCH) + .map(|v| v.unwrap_or(true)) + } + + pub(crate) fn put_facet_search(&self, txn: &mut RwTxn<'_>, val: bool) -> heed::Result<()> { + self.main.remap_types::>().put(txn, main_key::FACET_SEARCH, &val) + } + + pub(crate) fn delete_facet_search(&self, txn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::FACET_SEARCH) + } + pub fn localized_attributes_rules( &self, rtxn: &RoTxn<'_>, @@ -1647,12 +1688,9 @@ impl Index { Ok(res) } - pub fn prefix_settings(&self, _rtxn: &RoTxn<'_>) -> Result { - Ok(PrefixSettings { - compute_prefixes: true, - max_prefix_length: 4, - prefix_count_threshold: 100, - }) + pub fn prefix_settings(&self, rtxn: &RoTxn<'_>) -> Result { + let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); + Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) } } @@ -1665,9 +1703,17 @@ pub struct IndexEmbeddingConfig { #[derive(Debug, Deserialize, Serialize)] pub struct PrefixSettings { - pub prefix_count_threshold: u64, + pub prefix_count_threshold: usize, pub max_prefix_length: usize, - pub compute_prefixes: bool, + pub compute_prefixes: PrefixSearch, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub enum PrefixSearch { + #[default] + IndexingTime, + Disabled, } #[derive(Serialize, Deserialize)] @@ -1688,6 +1734,7 @@ pub(crate) mod tests { use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; + use crate::progress::Progress; use crate::update::new::indexer; use crate::update::settings::InnerIndexSettings; use crate::update::{ @@ -1764,7 +1811,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), )?; if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { @@ -1775,6 +1822,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1782,7 +1830,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) }) .unwrap()?; @@ -1854,7 +1902,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), )?; if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { @@ -1865,6 +1913,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, 
new_fields_ids_map, @@ -1872,7 +1921,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) }) .unwrap()?; @@ -1934,7 +1983,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1945,6 +1994,7 @@ pub(crate) mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1952,7 +2002,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| should_abort.load(Relaxed), - &|_| (), + &Progress::default(), ) }) .unwrap() diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 48b03b6cc..3ae0bfdb9 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(all(test, fuzzing), feature(no_coverage))] #![allow(clippy::type_complexity)] +#[cfg(not(windows))] #[cfg(test)] #[global_allocator] pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; @@ -30,6 +31,7 @@ pub mod vector; #[macro_use] pub mod snapshot_tests; mod fieldids_weights_map; +pub mod progress; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs new file mode 100644 index 000000000..accc2cf56 --- /dev/null +++ b/crates/milli/src/progress.rs @@ -0,0 +1,152 @@ +use std::any::TypeId; +use std::borrow::Cow; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, RwLock}; + +use serde::Serialize; + +pub trait Step: 'static + Send + Sync { + fn name(&self) -> Cow<'static, str>; + fn current(&self) -> u32; + fn total(&self) -> u32; +} + +#[derive(Clone, Default)] +pub struct Progress { + steps: Arc<RwLock<Vec<(TypeId, Box<dyn Step>)>>>, +} + +impl Progress { + pub fn update_progress<P: Step>(&self, sub_progress: P) { + let mut steps = self.steps.write().unwrap(); + let step_type = TypeId::of::<P>
(); + if let Some(idx) = steps.iter().position(|(id, _)| *id == step_type) { + steps.truncate(idx); + } + steps.push((step_type, Box::new(sub_progress))); + } + + // TODO: This code should be in meilisearch_types but cannot because milli can't depend on meilisearch_types + pub fn as_progress_view(&self) -> ProgressView { + let steps = self.steps.read().unwrap(); + + let mut percentage = 0.0; + let mut prev_factors = 1.0; + + let mut step_view = Vec::with_capacity(steps.len()); + for (_, step) in steps.iter() { + prev_factors *= step.total() as f32; + percentage += step.current() as f32 / prev_factors; + + step_view.push(ProgressStepView { + current_step: step.name(), + finished: step.current(), + total: step.total(), + }); + } + + ProgressView { steps: step_view, percentage: percentage * 100.0 } + } +} + +/// This trait lets you use the AtomicSubStep defined right below. +/// The name must be a const that never changes, but that can't be enforced by the type system because it would make the trait non object-safe. +/// By forcing the Default trait + the &'static str we make it harder to misuse the trait. +pub trait NamedStep: 'static + Send + Sync + Default { + fn name(&self) -> &'static str; +} + +/// Structure to quickly define steps that need very quick, lockless updating of their current step. +/// You can use this struct if: +/// - The name of the step doesn't change +/// - The total number of steps doesn't change +pub struct AtomicSubStep<Name: NamedStep> { + unit_name: Name, + current: Arc<AtomicU32>, + total: u32, +} + +impl<Name: NamedStep> AtomicSubStep<Name> { + pub fn new(total: u32) -> (Arc<AtomicU32>, Self) { + let current = Arc::new(AtomicU32::new(0)); + (current.clone(), Self { current, total, unit_name: Name::default() }) + } +} + +impl<Name: NamedStep> Step for AtomicSubStep<Name> { + fn name(&self) -> Cow<'static, str> { + self.unit_name.name().into() + } + + fn current(&self) -> u32 { + self.current.load(Ordering::Relaxed) + } + + fn total(&self) -> u32 { + self.total + } +} + +#[macro_export] +macro_rules! make_enum_progress { + ($visibility:vis enum $name:ident { $($variant:ident,)+ }) => { + #[repr(u8)] + #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] + #[allow(clippy::enum_variant_names)] + $visibility enum $name { + $($variant),+ + } + + impl Step for $name { + fn name(&self) -> Cow<'static, str> { + use convert_case::Casing; + + match self { + $( + $name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into() + ),+ + } + } + + fn current(&self) -> u32 { + *self as u32 + } + + fn total(&self) -> u32 { + Self::CARDINALITY as u32 + } + } + }; +} + +#[macro_export] +macro_rules!
make_atomic_progress { + ($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => { + #[derive(Default, Debug, Clone, Copy)] + pub struct $struct_name {} + impl NamedStep for $struct_name { + fn name(&self) -> &'static str { + $step_name + } + } + pub type $atomic_struct_name = AtomicSubStep<$struct_name>; + }; +} + +make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); +make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" ); + +#[derive(Debug, Serialize, Clone)] +#[serde(rename_all = "camelCase")] +pub struct ProgressView { + pub steps: Vec, + pub percentage: f32, +} + +#[derive(Debug, Serialize, Clone)] +#[serde(rename_all = "camelCase")] +pub struct ProgressStepView { + pub current_step: Cow<'static, str>, + pub finished: u32, + pub total: u32, +} diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index dea7946da..ae0a506ac 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -3,12 +3,13 @@ use std::collections::BTreeMap; use std::fmt::{self, Debug}; use bumpalo::Bump; +use bumparaw_collections::{RawMap, RawVec, Value}; use liquid::model::{ ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State, Value as LiquidValue, }; use liquid::{ObjectView, ValueView}; -use raw_collections::{RawMap, RawVec}; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; @@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc } impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { - fn as_debug(&self) -> &dyn fmt::Debug { + fn as_debug(&self) -> &dyn Debug { self } fn render(&self) -> liquid::model::DisplayCow<'_> { @@ -243,14 +244,13 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, } } -#[derive(Debug)] struct ParseableValue<'doc> { - value: raw_collections::Value<'doc>, + value: Value<'doc, FxBuildHasher>, } impl<'doc> ParseableValue<'doc> { pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { - let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap(); + let value = Value::from_raw_value_and_hasher(value, FxBuildHasher, doc_alloc).unwrap(); Self { value } } @@ -260,19 +260,19 @@ impl<'doc> ParseableValue<'doc> { } // transparent newtype for implementing ValueView -#[repr(transparent)] #[derive(Debug)] -struct ParseableMap<'doc>(RawMap<'doc>); +#[repr(transparent)] +struct ParseableMap<'doc>(RawMap<'doc, FxBuildHasher>); // transparent newtype for implementing ValueView -#[repr(transparent)] #[derive(Debug)] +#[repr(transparent)] struct ParseableArray<'doc>(RawVec<'doc>); impl<'doc> ParseableMap<'doc> { - pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> { + pub fn as_parseable<'a>(map: &'a RawMap<'doc, FxBuildHasher>) -> &'a ParseableMap<'doc> { // SAFETY: repr(transparent) - unsafe { &*(map as *const RawMap as *const Self) } + unsafe { &*(map as *const RawMap as *const Self) } } } @@ -447,8 +447,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn render(&self) -> DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.render(), Value::Bool(v) => v.render(), @@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn source(&self) -> 
DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.source(), Value::Bool(v) => ValueView::source(v), @@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn type_name(&self) -> &'static str { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.type_name(), Value::Bool(v) => v.type_name(), @@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn query_state(&self, state: State) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::query_state(&LiquidValue::Nil, state), Value::Bool(v) => ValueView::query_state(v, state), @@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_kstr(&self) -> KStringCow<'_> { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::to_kstr(&LiquidValue::Nil), Value::Bool(v) => ValueView::to_kstr(v), @@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_value(&self) -> LiquidValue { - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil, Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { - raw_collections::value::Number::PosInt(number) => { + Number::PosInt(number) => { let number: i64 = match (*number).try_into() { Ok(number) => number, Err(_) => { @@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { }; LiquidValue::Scalar(ScalarCow::new(number)) } - raw_collections::value::Number::NegInt(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } - raw_collections::value::Number::Finite(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } + Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)), + Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)), }, Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())), Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(), @@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn as_scalar(&self) -> Option> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { @@ -576,34 +580,41 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn is_scalar(&self) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_)) } fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { - if let raw_collections::Value::Array(array) = &self.value { + if let Value::Array(array) = &self.value { return Some(ParseableArray::as_parseable(array) as _); } None } fn is_array(&self) -> bool { - matches!(&self.value, raw_collections::Value::Array(_)) + matches!(&self.value, bumparaw_collections::Value::Array(_)) } fn as_object(&self) -> Option<&dyn ObjectView> { - if let raw_collections::Value::Object(object) = &self.value { + if let 
Value::Object(object) = &self.value { return Some(ParseableMap::as_parseable(object) as _); } None } fn is_object(&self) -> bool { - matches!(&self.value, raw_collections::Value::Object(_)) + matches!(&self.value, bumparaw_collections::Value::Object(_)) } fn is_nil(&self) -> bool { - matches!(&self.value, raw_collections::Value::Null) + matches!(&self.value, bumparaw_collections::Value::Null) + } +} + +impl Debug for ParseableValue<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ParseableValue").field("value", &self.value).finish() } } diff --git a/crates/milli/src/prompt/error.rs b/crates/milli/src/prompt/error.rs index 8a762b60a..a92e2fdc3 100644 --- a/crates/milli/src/prompt/error.rs +++ b/crates/milli/src/prompt/error.rs @@ -38,6 +38,16 @@ pub struct RenderPromptError { pub fault: FaultSource, } impl RenderPromptError { + pub(crate) fn missing_context_with_external_docid( + external_docid: String, + inner: liquid::Error, + ) -> RenderPromptError { + Self { + kind: RenderPromptErrorKind::MissingContextWithExternalDocid(external_docid, inner), + fault: FaultSource::User, + } + } + pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError { Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User } } @@ -47,6 +57,8 @@ impl RenderPromptError { pub enum RenderPromptErrorKind { #[error("missing field in document: {0}")] MissingContext(liquid::Error), + #[error("missing field in document `{0}`: {1}")] + MissingContextWithExternalDocid(String, liquid::Error), } impl From for crate::Error { diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index bbcf054e6..3eb91611e 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -119,6 +119,7 @@ impl Prompt { 'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents >( &self, + external_docid: &str, document: impl crate::update::new::document::Document<'a> + Debug, field_id_map: &RefCell, doc_alloc: &'doc Bump, @@ -130,9 +131,12 @@ impl Prompt { self.max_bytes.unwrap_or_else(default_max_bytes).get(), doc_alloc, ); - self.template - .render_to(&mut rendered, &context) - .map_err(RenderPromptError::missing_context)?; + self.template.render_to(&mut rendered, &context).map_err(|liquid_error| { + RenderPromptError::missing_context_with_external_docid( + external_docid.to_owned(), + liquid_error, + ) + })?; Ok(std::str::from_utf8(rendered.into_bump_slice()) .expect("render can only write UTF-8 because all inputs and processing preserve utf-8")) } diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index 5187b572b..368d61833 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -207,7 +207,11 @@ impl<'a> Search<'a> { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results( + self.limit, + self.offset, + keyword_results, + )); } } } diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 3f5c60efd..19c1127cd 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -274,7 +274,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { last_match_last_token_position_plus_one } else { // we have matched the end of possible tokens, there's nothing to advance - tokens.len() - 1 + tokens.len() } }; diff --git 
a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index f7c590360..4edcd09de 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; +use crate::index::PrefixSearch; use crate::localized_attributes_rules::LocalizedFieldIds; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; @@ -68,6 +69,7 @@ pub struct SearchContext<'ctx> { pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, pub restricted_fids: Option, + pub prefix_search: PrefixSearch, } impl<'ctx> SearchContext<'ctx> { @@ -85,6 +87,8 @@ impl<'ctx> SearchContext<'ctx> { } } + let prefix_search = index.prefix_search(txn)?.unwrap_or_default(); + Ok(Self { index, txn, @@ -94,9 +98,14 @@ impl<'ctx> SearchContext<'ctx> { term_interner: <_>::default(), phrase_docids: <_>::default(), restricted_fids: None, + prefix_search, }) } + pub fn is_prefix_search_allowed(&self) -> bool { + self.prefix_search != PrefixSearch::Disabled + } + pub fn attributes_to_search_on( &mut self, attributes_to_search_on: &'ctx [String], diff --git a/crates/milli/src/search/new/query_term/parse_query.rs b/crates/milli/src/search/new/query_term/parse_query.rs index bb98f19ce..a76fd6525 100644 --- a/crates/milli/src/search/new/query_term/parse_query.rs +++ b/crates/milli/src/search/new/query_term/parse_query.rs @@ -28,6 +28,7 @@ pub fn located_query_terms_from_tokens( words_limit: Option, ) -> Result { let nbr_typos = number_of_typos_allowed(ctx)?; + let allow_prefix_search = ctx.is_prefix_search_allowed(); let mut query_terms = Vec::new(); @@ -94,7 +95,7 @@ pub fn located_query_terms_from_tokens( ctx, word, nbr_typos(word), - true, + allow_prefix_search, false, )?; let located_term = LocatedQueryTerm { diff --git a/crates/milli/src/search/new/resolve_query_graph.rs b/crates/milli/src/search/new/resolve_query_graph.rs index 7a47b0a66..4496f8c65 100644 --- a/crates/milli/src/search/new/resolve_query_graph.rs +++ b/crates/milli/src/search/new/resolve_query_graph.rs @@ -193,15 +193,23 @@ pub fn compute_phrase_docids( if words.is_empty() { return Ok(RoaringBitmap::new()); } - let mut candidates = RoaringBitmap::new(); + let mut candidates = None; for word in words.iter().flatten().copied() { if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? 
{ - candidates |= word_docids; + if let Some(candidates) = candidates.as_mut() { + *candidates &= word_docids; + } else { + candidates = Some(word_docids); + } } else { return Ok(RoaringBitmap::new()); } } + let Some(mut candidates) = candidates else { + return Ok(RoaringBitmap::new()); + }; + let winsize = words.len().min(3); for win in words.windows(winsize) { diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 79668b34b..04d3b6667 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -5,6 +5,7 @@ use bumpalo::Bump; use heed::EnvOpenOptions; use maplit::{btreemap, hashset}; +use crate::progress::Progress; use crate::update::new::indexer; use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use crate::vector::EmbeddingConfigs; @@ -72,7 +73,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -83,6 +84,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -90,7 +92,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index 2e592519b..911296577 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -172,6 +172,14 @@ impl<'i> FacetsUpdate<'i> { incremental_update.execute(wtxn)?; } + if !self.index.facet_search(wtxn)? { + // If facet search is disabled, we don't need to compute facet search databases. + // We clear the facet search databases. 
+ self.index.facet_id_string_fst.clear(wtxn)?; + self.index.facet_id_normalized_string_strings.clear(wtxn)?; + return Ok(()); + } + match self.normalized_delta_data { Some(data) => index_facet_search(wtxn, data, self.index), None => Ok(()), diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index b1e6f24be..606ae6b54 100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -58,9 +58,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let old_dictionary: Option> = settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let del_builder = + let mut del_builder = tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref()); - let del_tokenizer = del_builder.into_tokenizer(); + let del_tokenizer = del_builder.build(); let new_stop_words = settings_diff.new.stop_words.as_ref(); let new_separators: Option> = settings_diff @@ -70,9 +70,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let new_dictionary: Option> = settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let add_builder = + let mut add_builder = tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref()); - let add_tokenizer = add_builder.into_tokenizer(); + let add_tokenizer = add_builder.build(); // iterate over documents. let mut cursor = obkv_documents.into_cursor()?; diff --git a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index e0d7e1386..d330ea5a0 100644 --- a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -34,10 +34,12 @@ pub fn extract_facet_string_docids( extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) } else { let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; + let facet_search = settings_diff.new.facet_search; extract_facet_string_docids_document_update( docid_fid_facet_string, indexer, localized_field_ids, + facet_search, ) } } @@ -51,6 +53,7 @@ fn extract_facet_string_docids_document_update( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, localized_field_ids: &LocalizedFieldIds, + facet_search: bool, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -96,7 +99,7 @@ fn extract_facet_string_docids_document_update( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - { + if facet_search { let locales = localized_field_ids.locales(field_id); let hyper_normalized_value = normalize_facet_string(normalized_value, locales); @@ -179,8 +182,10 @@ fn extract_facet_string_docids_settings( let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); let are_same_locales = old_locales == new_locales; + let reindex_facet_search = + settings_diff.new.facet_search && !settings_diff.old.facet_search; - if is_same_value && are_same_locales { + if is_same_value && are_same_locales && !reindex_facet_search { continue; } @@ -191,18 +196,26 @@ fn 
extract_facet_string_docids_settings( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - { - let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); - let new_hyper_normalized_value = if are_same_locales { - &old_hyper_normalized_value + if settings_diff.new.facet_search { + let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); + let old_hyper_normalized_value; + let old_hyper_normalized_value = if !settings_diff.old.facet_search + || deladd_reader.get(DelAdd::Deletion).is_none() + { + // if the facet search is disabled in the old settings or if no facet string is deleted, + // we don't need to normalize the facet string. + None + } else if are_same_locales { + Some(&new_hyper_normalized_value) } else { - &normalize_facet_string(normalized_value, new_locales) + old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); + Some(&old_hyper_normalized_value) }; let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == new_hyper_normalized_value.as_str() { + if old_hyper_normalized_value == Some(&new_hyper_normalized_value) { // nothing to do if we delete and re-add the value. if is_same_value { continue; @@ -222,7 +235,7 @@ fn extract_facet_string_docids_settings( } else { // if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different. // deletion - if deladd_reader.get(DelAdd::Deletion).is_some() { + if let Some(old_hyper_normalized_value) = old_hyper_normalized_value { // insert old value let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; buffer.clear(); diff --git a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 047669521..88c02fe70 100644 --- a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -80,7 +80,7 @@ pub fn extract_fid_docid_facet_values( let new_faceted_fids: BTreeSet<_> = settings_diff.new.faceted_fields_ids.iter().copied().collect(); - if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { + if !settings_diff.settings_update_only || settings_diff.reindex_facets() { let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::from_slice(value); @@ -112,8 +112,10 @@ pub fn extract_fid_docid_facet_values( (field_id, None, add_value) } EitherOrBoth::Both(&field_id, _) => { - // during settings update, recompute the changing settings only. - if settings_diff.settings_update_only { + // during settings update, recompute the changing settings only unless a global change is detected. 
+ if settings_diff.settings_update_only + && !settings_diff.global_facet_settings_changed() + { continue; } diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index baecbdcf0..bae8e00b4 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -29,6 +29,7 @@ pub use self::transform::{Transform, TransformOutput}; use super::new::StdResult; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; +use crate::index::{PrefixSearch, PrefixSettings}; use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ @@ -82,8 +83,6 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { #[derive(Default, Debug, Clone)] pub struct IndexDocumentsConfig { - pub words_prefix_threshold: Option, - pub max_prefix_length: Option, pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, @@ -565,14 +564,32 @@ where self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?; // Run the words prefixes update operation. - let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); - if let Some(value) = self.config.words_prefix_threshold { - builder.threshold(value); + let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = + self.index.prefix_settings(self.wtxn)?; + + // If the prefix search is enabled at indexing time, we compute the prefixes. + if compute_prefixes == PrefixSearch::IndexingTime { + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); + builder.threshold(prefix_count_threshold); + builder.max_prefix_length(max_prefix_length); + builder.execute()?; + } else { + // If the prefix search is disabled at indexing time, we delete the previous words prefixes fst. + // And all the associated docids databases. 
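The `PrefixSettings` read above now decides whether the word-prefix databases are computed at all. As a rough, standalone illustration of what the `prefix_count_threshold`/`max_prefix_length` pair controls, here is a simplified prefix selection over a word list; it is not the milli `WordsPrefixesFst` implementation, only the thresholding idea behind it:

```rust
use std::collections::BTreeMap;

/// Returns the prefixes of at most `max_prefix_length` bytes shared by at least
/// `prefix_count_threshold` words.
fn frequent_prefixes(
    words: &[&str],
    prefix_count_threshold: usize,
    max_prefix_length: usize,
) -> Vec<String> {
    let mut counts: BTreeMap<String, usize> = BTreeMap::new();
    for word in words {
        for len in 1..=max_prefix_length.min(word.len()) {
            // Only cut on UTF-8 character boundaries.
            if word.is_char_boundary(len) {
                *counts.entry(word[..len].to_string()).or_default() += 1;
            }
        }
    }
    counts
        .into_iter()
        .filter(|(_, count)| *count >= prefix_count_threshold)
        .map(|(prefix, _)| prefix)
        .collect()
}

fn main() {
    let words = ["winter", "windows", "window", "wine", "wood"];
    // With a threshold of 3 words and prefixes of at most 3 bytes: ["w", "wi", "win"].
    println!("{:?}", frequent_prefixes(&words, 3, 3));
}
```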
+ self.index.delete_words_prefixes_fst(self.wtxn)?; + self.index.word_prefix_docids.clear(self.wtxn)?; + self.index.exact_word_prefix_docids.clear(self.wtxn)?; + self.index.word_prefix_position_docids.clear(self.wtxn)?; + self.index.word_prefix_fid_docids.clear(self.wtxn)?; + + databases_seen += 3; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + return Ok(()); } - if let Some(value) = self.config.max_prefix_length { - builder.max_prefix_length(value); - } - builder.execute()?; if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); @@ -749,6 +766,7 @@ mod tests { use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; use crate::index::IndexEmbeddingConfig; + use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; @@ -1947,7 +1965,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -2131,13 +2149,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2145,7 +2164,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2192,13 +2211,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2206,7 +2226,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2244,13 +2264,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2258,7 +2279,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2295,13 +2316,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2309,7 +2331,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2348,13 +2370,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2362,7 +2385,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2406,13 +2429,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + 
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2420,7 +2444,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2457,13 +2481,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2471,7 +2496,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2508,13 +2533,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2522,7 +2548,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2701,13 +2727,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2715,7 +2742,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2759,13 +2786,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2773,7 +2801,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2814,13 +2842,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2828,7 +2857,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index 38bf90435..7477b5667 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -667,14 +667,23 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only a faceted field has been added, keep only this field. 
- let must_reindex_facets = settings_diff.reindex_facets(); - let necessary_faceted_field = |id: FieldId| -> bool { - let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); - must_reindex_facets - && modified_faceted_fields - .iter() - .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long)) - }; + let global_facet_settings_changed = settings_diff.global_facet_settings_changed(); + let facet_fids_changed = settings_diff.facet_fids_changed(); + let necessary_faceted_field = + |id: FieldId| -> bool { + let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); + if global_facet_settings_changed { + settings_diff.new.user_defined_faceted_fields.iter().any(|long| { + is_faceted_by(long, field_name) || is_faceted_by(field_name, long) + }) + } else if facet_fids_changed { + modified_faceted_fields.iter().any(|long| { + is_faceted_by(long, field_name) || is_faceted_by(field_name, long) + }) + } else { + false + } + }; // Alway provide all fields when vectors are involved because // we need the fields for the prompt/templating. diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 3afcd3e4b..7590c02ac 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,104 +1,347 @@ +use std::cell::RefCell; +use std::io::{self, BufWriter}; +use std::iter::Cycle; use std::marker::PhantomData; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::mem; +use std::num::NonZeroU16; +use std::ops::Range; +use std::time::Duration; -use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; +use bbqueue::framed::{FrameGrantR, FrameProducer}; +use bbqueue::BBBuffer; +use bytemuck::{checked, CheckedBitPattern, NoUninit}; +use flume::{RecvTimeoutError, SendError}; use heed::types::Bytes; -use heed::BytesDecode; -use memmap2::Mmap; +use heed::{BytesDecode, MdbError}; +use memmap2::{Mmap, MmapMut}; use roaring::RoaringBitmap; use super::extract::FacetKind; +use super::ref_cell_ext::RefCellExt; +use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::index::db_name; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; -use crate::index::IndexEmbeddingConfig; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; -use crate::{DocumentId, Index}; +use crate::{CboRoaringBitmapCodec, DocumentId, Error, Index, InternalError}; -/// The capacity of the channel is currently in number of messages. -pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) { - let (sender, receiver) = crossbeam_channel::bounded(cap); - ( - ExtractorSender { - sender, - send_count: Default::default(), - writer_contentious_count: Default::default(), - extractor_contentious_count: Default::default(), - }, - WriterReceiver(receiver), - ) +/// Creates a tuple of senders/receiver to be used by +/// the extractors and the writer loop. +/// +/// The `total_bbbuffer_capacity` represents the number of bytes +/// allocated to all BBQueue buffers. It will be split by the +/// number of threads. +/// +/// The `channel_capacity` parameter defines the number of +/// too-large-to-fit-in-BBQueue entries that can be sent through +/// a flume channel. This parameter must stay low to make +/// sure we do not use too much memory. 
+/// +/// Note that the channel is also used to wake-up the receiver +/// when new stuff is available in any BBQueue buffer but we send +/// a message in this queue only if it is empty to avoid filling +/// the channel *and* the BBQueue. +pub fn extractor_writer_bbqueue( + bbbuffers: &mut Vec, + total_bbbuffer_capacity: usize, + channel_capacity: usize, +) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { + let current_num_threads = rayon::current_num_threads(); + let bbbuffer_capacity = total_bbbuffer_capacity.checked_div(current_num_threads).unwrap(); + bbbuffers.resize_with(current_num_threads, || BBBuffer::new(bbbuffer_capacity)); + + let capacity = bbbuffers.first().unwrap().capacity(); + // Read the field description to understand this + let capacity = capacity.checked_sub(9).unwrap(); + + let producers = ThreadLocal::with_capacity(bbbuffers.len()); + let consumers = rayon::broadcast(|bi| { + let bbqueue = &bbbuffers[bi.index()]; + let (producer, consumer) = bbqueue.try_split_framed().unwrap(); + producers.get_or(|| FullySend(RefCell::new(producer))); + consumer + }); + + let (sender, receiver) = flume::bounded(channel_capacity); + let sender = ExtractorBbqueueSender { sender, producers, capacity }; + let receiver = WriterBbqueueReceiver { + receiver, + look_at_consumer: (0..consumers.len()).cycle(), + consumers, + }; + (sender, receiver) } -pub enum KeyValueEntry { - Small { key_length: usize, data: Box<[u8]> }, - Large { key_entry: KeyEntry, data: Mmap }, +pub struct ExtractorBbqueueSender<'a> { + /// This channel is used to wake-up the receiver and + /// send large entries that cannot fit in the BBQueue. + sender: flume::Sender, + /// A memory buffer, one by thread, is used to serialize + /// the entries directly in this shared, lock-free space. + producers: ThreadLocal>>>, + /// The capacity of this frame producer, will never be able to store more than that. + /// + /// Note that the FrameProducer requires up to 9 bytes to encode the length, + /// the capacity has been shrunk accordingly. 
+ /// + /// + capacity: usize, } -impl KeyValueEntry { - pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self { - let mut data = Vec::with_capacity(key.len() + value.len()); - data.extend_from_slice(key); - data.extend_from_slice(value); - KeyValueEntry::Small { key_length: key.len(), data: data.into_boxed_slice() } - } - - fn from_large_key_value(key: &[u8], value: Mmap) -> Self { - KeyValueEntry::Large { key_entry: KeyEntry::from_key(key), data: value } - } - - pub fn key(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[..*key_length], - KeyValueEntry::Large { key_entry, data: _ } => key_entry.entry(), - } - } - - pub fn value(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[*key_length..], - KeyValueEntry::Large { key_entry: _, data } => &data[..], - } - } -} - -pub struct KeyEntry { - data: Box<[u8]>, -} - -impl KeyEntry { - pub fn from_key(key: &[u8]) -> Self { - KeyEntry { data: key.to_vec().into_boxed_slice() } - } - - pub fn entry(&self) -> &[u8] { - self.data.as_ref() - } -} - -pub enum EntryOperation { - Delete(KeyEntry), - Write(KeyValueEntry), -} - -pub enum WriterOperation { - DbOperation(DbOperation), - ArroyOperation(ArroyOperation), -} - -pub enum ArroyOperation { - DeleteVectors { docid: DocumentId }, - SetVectors { docid: DocumentId, embedder_id: u8, embeddings: Vec }, - SetVector { docid: DocumentId, embedder_id: u8, embedding: Embedding }, - Finish { configs: Vec }, -} - -pub struct DbOperation { - database: Database, - entry: EntryOperation, +pub struct WriterBbqueueReceiver<'a> { + /// Used to wake up when new entries are available either in + /// any BBQueue buffer or directly sent throught this channel + /// (still written to disk). + receiver: flume::Receiver, + /// Indicates the consumer to observe. This cycling range + /// ensures fair distribution of work among consumers. + look_at_consumer: Cycle>, + /// The BBQueue frames to read when waking-up. + consumers: Vec>, } +/// The action to perform on the receiver/writer side. #[derive(Debug)] +pub enum ReceiverAction { + /// Wake up, you have frames to read for the BBQueue buffers. + WakeUp, + LargeEntry(LargeEntry), + LargeVectors(LargeVectors), +} + +/// An entry that cannot fit in the BBQueue buffers has been +/// written to disk, memory-mapped and must be written in the +/// database. +#[derive(Debug)] +pub struct LargeEntry { + /// The database where the entry must be written. + pub database: Database, + /// The key of the entry that must be written in the database. + pub key: Box<[u8]>, + /// The large value that must be written. + /// + /// Note: We can probably use a `File` here and + /// use `Database::put_reserved` instead of memory-mapping. + pub value: Mmap, +} + +/// When embeddings are larger than the available +/// BBQueue space it arrives here. +#[derive(Debug)] +pub struct LargeVectors { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The large embedding that must be written. + pub embeddings: Mmap, +} + +impl LargeVectors { + pub fn read_embeddings(&self, dimensions: usize) -> impl Iterator { + self.embeddings.chunks_exact(dimensions).map(bytemuck::cast_slice) + } +} + +impl<'a> WriterBbqueueReceiver<'a> { + /// Tries to receive an action to do until the timeout occurs + /// and if it does, consider it as a spurious wake up. 
+ pub fn recv_action(&mut self) -> Option { + match self.receiver.recv_timeout(Duration::from_millis(100)) { + Ok(action) => Some(action), + Err(RecvTimeoutError::Timeout) => Some(ReceiverAction::WakeUp), + Err(RecvTimeoutError::Disconnected) => None, + } + } + + /// Reads all the BBQueue buffers and selects the first available frame. + pub fn recv_frame(&mut self) -> Option> { + for index in self.look_at_consumer.by_ref().take(self.consumers.len()) { + if let Some(frame) = self.consumers[index].read() { + return Some(FrameWithHeader::from(frame)); + } + } + None + } +} + +pub struct FrameWithHeader<'a> { + header: EntryHeader, + frame: FrameGrantR<'a>, +} + +impl FrameWithHeader<'_> { + pub fn header(&self) -> EntryHeader { + self.header + } + + pub fn frame(&self) -> &FrameGrantR<'_> { + &self.frame + } +} + +impl<'a> From> for FrameWithHeader<'a> { + fn from(mut frame: FrameGrantR<'a>) -> Self { + frame.auto_release(true); + FrameWithHeader { header: EntryHeader::from_slice(&frame[..]), frame } + } +} + +/// A header that is written at the beginning of a bbqueue frame. +/// +/// Note that the different variants cannot be changed without taking +/// care of their size in the implementation, like, everywhere. +#[derive(Debug, Clone, Copy)] +#[repr(u8)] +pub enum EntryHeader { + DbOperation(DbOperation), + ArroyDeleteVector(ArroyDeleteVector), + ArroySetVectors(ArroySetVectors), +} + +impl EntryHeader { + const fn variant_size() -> usize { + mem::size_of::() + } + + const fn variant_id(&self) -> u8 { + match self { + EntryHeader::DbOperation(_) => 0, + EntryHeader::ArroyDeleteVector(_) => 1, + EntryHeader::ArroySetVectors(_) => 2, + } + } + + const fn total_key_value_size(key_length: NonZeroU16, value_length: usize) -> usize { + Self::variant_size() + + mem::size_of::() + + key_length.get() as usize + + value_length + } + + const fn total_key_size(key_length: NonZeroU16) -> usize { + Self::total_key_value_size(key_length, 0) + } + + const fn total_delete_vector_size() -> usize { + Self::variant_size() + mem::size_of::() + } + + /// The `dimensions` corresponds to the number of `f32` in the embedding. 
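A standalone sketch of the wake-up protocol that `recv_action` above implements together with the senders further down: the writer waits on a bounded flume channel with a timeout and treats a timeout as a spurious wake-up, while producers only push a `WakeUp` when the channel is empty so it never fills up with redundant notifications. The types and the `main` driver here are illustrative only:

```rust
use std::time::Duration;

use flume::{Receiver, RecvTimeoutError, Sender};

#[derive(Debug)]
enum Action {
    WakeUp,
    Payload(u32),
}

/// Producer side: notify the consumer only when the channel is currently empty.
fn notify(sender: &Sender<Action>) {
    if sender.is_empty() {
        // Ignoring the error is fine here: a disconnected consumer means shutdown.
        let _ = sender.send(Action::WakeUp);
    }
}

/// Consumer side: a timeout is a spurious wake-up, a disconnection ends the loop.
fn recv_action(receiver: &Receiver<Action>) -> Option<Action> {
    match receiver.recv_timeout(Duration::from_millis(100)) {
        Ok(action) => Some(action),
        Err(RecvTimeoutError::Timeout) => Some(Action::WakeUp),
        Err(RecvTimeoutError::Disconnected) => None,
    }
}

fn main() {
    let (sender, receiver) = flume::bounded(10);
    notify(&sender);
    notify(&sender); // No-op: the channel already contains a WakeUp.
    sender.send(Action::Payload(42)).unwrap();
    drop(sender);

    while let Some(action) = recv_action(&receiver) {
        println!("{action:?}");
    }
}
```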
+ fn total_set_vectors_size(count: usize, dimensions: usize) -> usize { + let embedding_size = dimensions * mem::size_of::(); + Self::variant_size() + mem::size_of::() + embedding_size * count + } + + fn header_size(&self) -> usize { + let payload_size = match self { + EntryHeader::DbOperation(op) => mem::size_of_val(op), + EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), + EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), + }; + Self::variant_size() + payload_size + } + + fn from_slice(slice: &[u8]) -> EntryHeader { + let (variant_id, remaining) = slice.split_first().unwrap(); + match variant_id { + 0 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::DbOperation(header) + } + 1 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroyDeleteVector(header) + } + 2 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVectors(header) + } + id => panic!("invalid variant id: {id}"), + } + } + + fn serialize_into(&self, header_bytes: &mut [u8]) { + let (first, remaining) = header_bytes.split_first_mut().unwrap(); + let payload_bytes = match self { + EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), + EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), + EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), + }; + *first = self.variant_id(); + remaining.copy_from_slice(payload_bytes); + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// Wether a put of the key/value pair or a delete of the given key. +pub struct DbOperation { + /// The database on which to perform the operation. + pub database: Database, + /// The key length in the buffer. + /// + /// If None it means that the buffer is dedicated + /// to the key and it is therefore a deletion operation. + pub key_length: Option, +} + +impl DbOperation { + pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { + let skip = EntryHeader::variant_size() + mem::size_of::(); + match self.key_length { + Some(key_length) => { + let (key, value) = frame[skip..].split_at(key_length.get() as usize); + (key, Some(value)) + } + None => (&frame[skip..], None), + } + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(transparent)] +pub struct ArroyDeleteVector { + pub docid: DocumentId, +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVectors { + pub docid: DocumentId, + pub embedder_id: u8, + _padding: [u8; 3], +} + +impl ArroySetVectors { + fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + /// Read all the embeddings and write them into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); + &vec[..] 
+ } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(u16)] pub enum Database { Main, Documents, @@ -112,7 +355,7 @@ pub enum Database { FacetIdIsNullDocids, FacetIdIsEmptyDocids, FacetIdExistsDocids, - FacetIdF64NumberDocids, + FacetIdF64Docids, FacetIdStringDocids, FieldIdDocidFacetStrings, FieldIdDocidFacetF64s, @@ -133,18 +376,39 @@ impl Database { Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(), Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(), Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(), - Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(), + Database::FacetIdF64Docids => index.facet_id_f64_docids.remap_types(), Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), } } + + pub fn database_name(&self) -> &'static str { + match self { + Database::Main => db_name::MAIN, + Database::Documents => db_name::DOCUMENTS, + Database::ExternalDocumentsIds => db_name::EXTERNAL_DOCUMENTS_IDS, + Database::ExactWordDocids => db_name::EXACT_WORD_DOCIDS, + Database::WordDocids => db_name::WORD_DOCIDS, + Database::WordFidDocids => db_name::WORD_FIELD_ID_DOCIDS, + Database::WordPositionDocids => db_name::WORD_POSITION_DOCIDS, + Database::FidWordCountDocids => db_name::FIELD_ID_WORD_COUNT_DOCIDS, + Database::WordPairProximityDocids => db_name::WORD_PAIR_PROXIMITY_DOCIDS, + Database::FacetIdIsNullDocids => db_name::FACET_ID_IS_NULL_DOCIDS, + Database::FacetIdIsEmptyDocids => db_name::FACET_ID_IS_EMPTY_DOCIDS, + Database::FacetIdExistsDocids => db_name::FACET_ID_EXISTS_DOCIDS, + Database::FacetIdF64Docids => db_name::FACET_ID_F64_DOCIDS, + Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, + Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, + Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, + } + } } impl From for Database { fn from(value: FacetKind) -> Self { match value { - FacetKind::Number => Database::FacetIdF64NumberDocids, + FacetKind::Number => Database::FacetIdF64Docids, FacetKind::String => Database::FacetIdStringDocids, FacetKind::Null => Database::FacetIdIsNullDocids, FacetKind::Empty => Database::FacetIdIsEmptyDocids, @@ -153,100 +417,255 @@ impl From for Database { } } -impl DbOperation { - pub fn database(&self, index: &Index) -> heed::Database { - self.database.database(index) - } - - pub fn entry(self) -> EntryOperation { - self.entry - } -} - -pub struct WriterReceiver(Receiver); - -impl IntoIterator for WriterReceiver { - type Item = WriterOperation; - type IntoIter = IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} - -pub struct ExtractorSender { - sender: Sender, - /// The number of message we sent in total in the channel. - send_count: AtomicUsize, - /// The number of times we sent something in a channel that was full. - writer_contentious_count: AtomicUsize, - /// The number of times we sent something in a channel that was empty. 
- extractor_contentious_count: AtomicUsize, -} - -impl Drop for ExtractorSender { - fn drop(&mut self) { - let send_count = *self.send_count.get_mut(); - let writer_contentious_count = *self.writer_contentious_count.get_mut(); - let extractor_contentious_count = *self.extractor_contentious_count.get_mut(); - tracing::debug!( - "Extractor channel stats: {send_count} sends, \ - {writer_contentious_count} writer contentions ({}%), \ - {extractor_contentious_count} extractor contentions ({}%)", - (writer_contentious_count as f32 / send_count as f32) * 100.0, - (extractor_contentious_count as f32 / send_count as f32) * 100.0 - ) - } -} - -impl ExtractorSender { - pub fn docids(&self) -> WordDocidsSender<'_, D> { +impl<'b> ExtractorBbqueueSender<'b> { + pub fn docids<'a, D: DatabaseType>(&'a self) -> WordDocidsSender<'a, 'b, D> { WordDocidsSender { sender: self, _marker: PhantomData } } - pub fn facet_docids(&self) -> FacetDocidsSender<'_> { + pub fn facet_docids<'a>(&'a self) -> FacetDocidsSender<'a, 'b> { FacetDocidsSender { sender: self } } - pub fn field_id_docid_facet_sender(&self) -> FieldIdDocidFacetSender<'_> { + pub fn field_id_docid_facet_sender<'a>(&'a self) -> FieldIdDocidFacetSender<'a, 'b> { FieldIdDocidFacetSender(self) } - pub fn documents(&self) -> DocumentsSender<'_> { + pub fn documents<'a>(&'a self) -> DocumentsSender<'a, 'b> { DocumentsSender(self) } - pub fn embeddings(&self) -> EmbeddingSender<'_> { - EmbeddingSender(&self.sender) + pub fn embeddings<'a>(&'a self) -> EmbeddingSender<'a, 'b> { + EmbeddingSender(self) } - pub fn geo(&self) -> GeoSender<'_> { - GeoSender(&self.sender) + pub fn geo<'a>(&'a self) -> GeoSender<'a, 'b> { + GeoSender(self) } - fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { - match self - .sender - .send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid })) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); + let total_length = EntryHeader::total_delete_vector_size(); + if total_length > capacity { + panic!("The entry is larger ({total_length} bytes) than the BBQueue capacity ({capacity} bytes)"); } + + // Spin loop to have a frame the size we requested. 
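`delete_vector` above, like every sender below, reserves a frame on the calling thread's BBQueue ring through `reserve_and_write_grant`, serializes the header into the grant and commits it, while the writer thread pops frames one by one. The standalone round trip below uses the upstream `bbqueue` crate, whose capacity is a const generic (the meilisearch fork used by this PR takes it at runtime), so read it as an illustration of the grant/commit/read/release cycle rather than the exact API of this patch:

```rust
use bbqueue::BBBuffer;

fn main() {
    // 64 bytes of backing storage shared by one producer and one consumer.
    let bb: BBBuffer<64> = BBBuffer::new();
    let (mut producer, mut consumer) = bb.try_split_framed().unwrap();

    // Reserve a frame large enough for the message, write into it, commit the bytes used.
    let mut grant = producer.grant(5).unwrap();
    grant[..5].copy_from_slice(b"hello");
    grant.commit(5);

    // The consumer pops one framed message at a time and releases it once handled.
    let frame = consumer.read().unwrap();
    assert_eq!(&frame[..], &b"hello"[..]);
    frame.release();
}
```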
+ reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + payload_header.serialize_into(grant); + Ok(()) + })?; + + Ok(()) } - fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> { - if self.sender.is_full() { - self.writer_contentious_count.fetch_add(1, Ordering::SeqCst); - } - if self.sender.is_empty() { - self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst); + fn set_vectors( + &self, + docid: u32, + embedder_id: u8, + embeddings: &[Vec], + ) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + // If there are no vectors we specify the dimensions + // to zero to allocate no extra space at all + let dimensions = embeddings.first().map_or(0, |emb| emb.len()); + + let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] }; + let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); + let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions); + if total_length > capacity { + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + for embedding in embeddings { + let mut embedding_bytes = bytemuck::cast_slice(embedding); + io::copy(&mut embedding_bytes, &mut value_file)?; + } + + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + let embeddings = unsafe { Mmap::map(&value_file)? }; + + let large_vectors = LargeVectors { docid, embedder_id, embeddings }; + self.sender.send(ReceiverAction::LargeVectors(large_vectors)).unwrap(); + + return Ok(()); } - self.send_count.fetch_add(1, Ordering::SeqCst); - match self.sender.send(WriterOperation::DbOperation(op)) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + // Spin loop to have a frame the size we requested. 
+ reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + + if dimensions != 0 { + let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } + } + + Ok(()) + })?; + + Ok(()) + } + + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length: value.len(), + error: MdbError::BadValSize.into(), + } + })?; + self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + value_buffer.copy_from_slice(value); + Ok(()) + }) + } + + fn write_key_value_with( + &self, + database: Database, + key_length: NonZeroU16, + value_length: usize, + key_value_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8], &mut [u8]) -> crate::Result<()>, + { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let operation = DbOperation { database, key_length: Some(key_length) }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_value_size(key_length, value_length); + if total_length > capacity { + let mut key_buffer = vec![0; key_length.get() as usize].into_boxed_slice(); + let value_file = tempfile::tempfile()?; + value_file.set_len(value_length.try_into().unwrap())?; + let mut mmap_mut = unsafe { MmapMut::map_mut(&value_file)? }; + + key_value_writer(&mut key_buffer, &mut mmap_mut)?; + + self.sender + .send(ReceiverAction::LargeEntry(LargeEntry { + database, + key: key_buffer, + value: mmap_mut.make_read_only()?, + })) + .unwrap(); + + return Ok(()); } + + // Spin loop to have a frame the size we requested. + reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); + key_value_writer(key_buffer, value_buffer) + })?; + + Ok(()) + } + + fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StoreDeletion { + database_name: database.database_name(), + key: key.into(), + error: MdbError::BadValSize.into(), + } + })?; + self.delete_entry_with(database, key_length, |buffer| { + buffer.copy_from_slice(key); + Ok(()) + }) + } + + fn delete_entry_with( + &self, + database: Database, + key_length: NonZeroU16, + key_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8]) -> crate::Result<()>, + { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + // For deletion we do not specify the key length, + // it's in the remaining bytes. 
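`set_vectors` and `write_key_value_with` above fall back to a temporary file whenever an entry is too large for the ring buffer: the value is written into the file, memory-mapped, and shipped to the writer through the flume channel as a `LargeVectors`/`LargeEntry`. A minimal sketch of that spill-to-mmap step on its own (error handling reduced to `?`, contents made up):

```rust
use std::io;

use memmap2::{Mmap, MmapMut};

/// Writes `value` into an anonymous temporary file and returns a read-only map of it.
fn spill_to_mmap(value: &[u8]) -> io::Result<Mmap> {
    let file = tempfile::tempfile()?;
    file.set_len(value.len() as u64)?;

    // Safety: the file is private to this process and is not resized while mapped.
    let mut mmap = unsafe { MmapMut::map_mut(&file)? };
    mmap.copy_from_slice(value);
    mmap.make_read_only()
}

fn main() -> io::Result<()> {
    let value = vec![0xAB_u8; 1 << 20]; // pretend this does not fit in the ring buffer
    let mmap = spill_to_mmap(&value)?;
    assert_eq!(mmap.len(), value.len());
    println!("spilled and mapped {} bytes", mmap.len());
    Ok(())
}
```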
+ let operation = DbOperation { database, key_length: None }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_size(key_length); + if total_length > capacity { + panic!("The entry is larger ({total_length} bytes) than the BBQueue capacity ({capacity} bytes)"); + } + + // Spin loop to have a frame the size we requested. + reserve_and_write_grant(&mut producer, total_length, &self.sender, |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_writer(remaining) + })?; + + Ok(()) + } +} + +/// Try to reserve a frame grant of `total_length` by spin +/// looping on the BBQueue buffer, panics if the receiver +/// has been disconnected or send a WakeUp message if necessary. +fn reserve_and_write_grant( + producer: &mut FrameProducer, + total_length: usize, + sender: &flume::Sender, + f: F, +) -> crate::Result<()> +where + F: FnOnce(&mut [u8]) -> crate::Result<()>, +{ + loop { + for _ in 0..10_000 { + match producer.grant(total_length) { + Ok(mut grant) => { + // We could commit only the used memory. + f(&mut grant)?; + grant.commit(total_length); + + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if sender.is_empty() { + sender.send(ReceiverAction::WakeUp).unwrap(); + } + + return Ok(()); + } + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + } + if sender.is_disconnected() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + // We prefer to yield and allow the writing thread + // to do its job, especially beneficial when there + // is only one CPU core available. 
+ std::thread::yield_now(); } } @@ -285,166 +704,160 @@ impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; } -pub trait DocidsSender { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>; - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>; -} - -pub struct WordDocidsSender<'a, D> { - sender: &'a ExtractorSender, +#[derive(Clone, Copy)] +pub struct WordDocidsSender<'a, 'b, D> { + sender: &'a ExtractorBbqueueSender<'b>, _marker: PhantomData, } -impl DocidsSender for WordDocidsSender<'_, D> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - -pub struct FacetDocidsSender<'a> { - sender: &'a ExtractorSender, -} - -impl DocidsSender for FacetDocidsSender<'_> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let (facet_kind, key) = FacetKind::extract_from_key(key); - let database = Database::from(facet_kind); - let entry = match facet_kind { - // skip level group size - FacetKind::String | FacetKind::Number => { - // add facet group size - let value = [&[1], value].concat(); - EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value)) +impl WordDocidsSender<'_, '_, D> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: D::DATABASE.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), } - _ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)), - }; - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + })?; + self.sender.write_key_value_with( + D::DATABASE, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; + Ok(()) + }, + ) } - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + self.sender.delete_entry(D::DATABASE, key) + } +} + +#[derive(Clone, Copy)] +pub struct FacetDocidsSender<'a, 'b> { + sender: &'a ExtractorBbqueueSender<'b>, +} + +impl FacetDocidsSender<'_, '_> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { let (facet_kind, key) = FacetKind::extract_from_key(key); let database = Database::from(facet_kind); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let value_length = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. 
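The senders above, as well as `set_geo_faceted` further down, all follow the same pattern: compute the exact serialized size first, reserve a grant of that size, then serialize the bitmap straight into the reserved slice rather than through an intermediate `Vec`. The same idea with plain `roaring` types (the extra facet-group-size byte and the milli-specific `Cbo` codec are left out):

```rust
use roaring::RoaringBitmap;

fn main() -> std::io::Result<()> {
    let bitmap: RoaringBitmap = (0u32..1_000).collect();

    // Ask for the exact size up front, as the senders do before reserving a frame grant...
    let mut buffer = vec![0u8; bitmap.serialized_size()];
    // ...then serialize directly into the pre-sized slice, with no extra allocation.
    bitmap.serialize_into(&mut buffer[..])?;

    // The writer side can decode the value straight from the frame bytes.
    let decoded = RoaringBitmap::deserialize_from(&buffer[..])?;
    assert_eq!(decoded, bitmap);
    Ok(())
}
```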
+ FacetKind::Number | FacetKind::String => value_length + 1, + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, + }; + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; + + self.sender.write_key_value_with( + database, + key_length, + value_length, + |key_out, value_out| { + key_out.copy_from_slice(key); + + let value_out = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. + FacetKind::String | FacetKind::Number => { + let (first, remaining) = value_out.split_first_mut().unwrap(); + *first = 1; + remaining + } + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, + }; + + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + + Ok(()) + }, + ) + } + + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + let (facet_kind, key) = FacetKind::extract_from_key(key); + let database = Database::from(facet_kind); + self.sender.delete_entry(database, key) } } -pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender); +#[derive(Clone, Copy)] +pub struct FieldIdDocidFacetSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl FieldIdDocidFacetSender<'_> { - pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { +impl FieldIdDocidFacetSender<'_, '_> { + pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetStrings, key, value) } - pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn write_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &[])); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetF64s, key, &[]) } - pub fn delete_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_string(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetStrings, key) } - pub fn delete_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetF64s, key) } } -pub struct DocumentsSender<'a>(&'a ExtractorSender); +#[derive(Clone, Copy)] +pub struct DocumentsSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl DocumentsSender<'_> { +impl DocumentsSender<'_, '_> { /// TODO do that efficiently pub fn uncompressed( &self, docid: DocumentId, 
external_id: String, document: &KvReaderFieldId, - ) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( - &docid.to_be_bytes(), - document.as_bytes(), - )); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( + ) -> crate::Result<()> { + self.0.write_key_value(Database::Documents, &docid.to_be_bytes(), document.as_bytes())?; + self.0.write_key_value( + Database::ExternalDocumentsIds, external_id.as_bytes(), &docid.to_be_bytes(), - )); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + ) } - pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes())); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - - self.0.send_delete_vector(docid)?; - - let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes())); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + pub fn delete(&self, docid: DocumentId, external_id: String) -> crate::Result<()> { + self.0.delete_entry(Database::Documents, &docid.to_be_bytes())?; + self.0.delete_vector(docid)?; + self.0.delete_entry(Database::ExternalDocumentsIds, external_id.as_bytes()) } } -pub struct EmbeddingSender<'a>(&'a Sender); +#[derive(Clone, Copy)] +pub struct EmbeddingSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl EmbeddingSender<'_> { +impl EmbeddingSender<'_, '_> { pub fn set_vectors( &self, docid: DocumentId, embedder_id: u8, embeddings: Vec, - ) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings, - })) - .map_err(|_| SendError(())) + ) -> crate::Result<()> { + self.0.set_vectors(docid, embedder_id, &embeddings[..]) } pub fn set_vector( @@ -452,51 +865,48 @@ impl EmbeddingSender<'_> { docid: DocumentId, embedder_id: u8, embedding: Embedding, - ) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVector { - docid, - embedder_id, - embedding, - })) - .map_err(|_| SendError(())) - } - - /// Marks all embedders as "to be built" - pub fn finish(self, configs: Vec) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { configs })) - .map_err(|_| SendError(())) + ) -> crate::Result<()> { + self.0.set_vectors(docid, embedder_id, &[embedding]) } } -pub struct GeoSender<'a>(&'a Sender); +#[derive(Clone, Copy)] +pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl GeoSender<'_> { +impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { self.0 - .send(WriterOperation::DbOperation(DbOperation { + .sender + .send(ReceiverAction::LargeEntry(LargeEntry { database: Database::Main, - entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( - GEO_RTREE_KEY.as_bytes(), - value, - )), + key: GEO_RTREE_KEY.to_string().into_bytes().into_boxed_slice(), + value, })) .map_err(|_| SendError(())) } 
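Throughout these senders, and in `set_geo_faceted` just below, the key length is validated once by converting it to a `NonZeroU16`: this rejects empty keys and keys whose length does not fit in a `u16` before anything is reserved in the ring buffer, and the real code maps the failure to an LMDB `BadValSize` error. The guard in isolation, with the error type simplified to a `String`:

```rust
use std::num::NonZeroU16;

/// Returns the key length as a `NonZeroU16`, rejecting empty and oversized keys.
fn checked_key_length(key: &[u8]) -> Result<NonZeroU16, String> {
    key.len()
        .try_into()
        .ok()
        .and_then(NonZeroU16::new)
        .ok_or_else(|| format!("invalid key length: {} bytes", key.len()))
}

fn main() {
    assert_eq!(checked_key_length(b"hello").map(NonZeroU16::get), Ok(5));
    assert!(checked_key_length(b"").is_err());
    assert!(checked_key_length(&vec![0u8; 70_000]).is_err());
}
```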
- pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> { - let mut buffer = Vec::new(); - bitmap.serialize_into(&mut buffer).unwrap(); + pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> { + let database = Database::Main; + let value_length = bitmap.serialized_size(); + let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; - self.0 - .send(WriterOperation::DbOperation(DbOperation { - database: Database::Main, - entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( - GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), - &buffer, - )), - })) - .map_err(|_| SendError(())) + self.0.write_key_value_with( + database, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + bitmap.serialize_into(value_buffer)?; + Ok(()) + }, + ) } } diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index b1a2218f2..930b0c078 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -1,7 +1,8 @@ use std::collections::{BTreeMap, BTreeSet}; +use bumparaw_collections::RawMap; use heed::RoTxn; -use raw_collections::RawMap; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::vector_document::VectorDocument; @@ -385,12 +386,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue); #[derive(Debug)] pub struct Versions<'doc> { - data: RawMap<'doc>, + data: RawMap<'doc, FxBuildHasher>, } impl<'doc> Versions<'doc> { pub fn multiple( - mut versions: impl Iterator>>, + mut versions: impl Iterator>>, ) -> Result> { let Some(data) = versions.next() else { return Ok(None) }; let mut data = data?; @@ -403,7 +404,7 @@ impl<'doc> Versions<'doc> { Ok(Some(Self::single(data))) } - pub fn single(version: RawMap<'doc>) -> Self { + pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self { Self { data: version } } diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 899655db1..1644b2254 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,7 +1,10 @@ use bumpalo::Bump; use heed::RoTxn; -use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; +use super::document::{ + Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, +}; +use super::extract::perm_json_p; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; @@ -164,6 +167,80 @@ impl<'doc> Update<'doc> { } } + /// Returns whether the updated version of the document is different from the current version for the passed subset of fields. + /// + /// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed. + /// Otherwise `false`. 
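`has_changed_for_fields` above short-circuits reindexing when none of the selected top-level fields actually changed between the current and the updated version of a document. A much-simplified, serde_json-based sketch of those semantics (the real code streams over `RawValue` entries, handles nested field selectors and counts fields to detect deletions):

```rust
use serde_json::{json, Value};

/// Returns `true` when at least one selected top-level field differs between the two documents.
/// `None` means "all fields are selected"; only exact field names are handled here.
fn has_changed_for_fields(current: &Value, updated: &Value, fields: Option<&[&str]>) -> bool {
    let (Some(current), Some(updated)) = (current.as_object(), updated.as_object()) else {
        return current != updated;
    };
    let selected = |key: &str| fields.map_or(true, |fields| fields.contains(&key));

    // A selected field that was added or rewritten counts as a change...
    let changed_or_added = updated
        .iter()
        .filter(|(key, _)| selected(key.as_str()))
        .any(|(key, value)| current.get(key.as_str()) != Some(value));
    // ...and so does a selected field that disappeared from the updated version.
    let removed = current
        .keys()
        .filter(|key| selected(key.as_str()))
        .any(|key| !updated.contains_key(key.as_str()));

    changed_or_added || removed
}

fn main() {
    let before = json!({ "title": "Mask", "year": 1985, "genre": "drama" });
    let after = json!({ "title": "Mask", "year": 1994, "genre": "drama" });
    assert!(has_changed_for_fields(&before, &after, Some(&["year"])));
    assert!(!has_changed_for_fields(&before, &after, Some(&["title", "genre"])));
}
```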
+ pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( + &self, + fields: Option<&[&str]>, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result { + let mut changed = false; + let mut cached_current = None; + let mut updated_selected_field_count = 0; + + for entry in self.updated().iter_top_level_fields() { + let (key, updated_value) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + + updated_selected_field_count += 1; + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + let current_value = current.top_level_field(key)?; + let Some(current_value) = current_value else { + changed = true; + break; + }; + + if current_value.get() != updated_value.get() { + changed = true; + break; + } + cached_current = Some(current); + } + + if !self.has_deletion { + // no field deletion, so fields that don't appear in `updated` cannot have changed + return Ok(changed); + } + + if changed { + return Ok(true); + } + + // we saw all updated fields, and set `changed` if any field wasn't in `current`. + // so if there are as many fields in `current` as in `updated`, then nothing changed. + // If there is any more fields in `current`, then they are missing in `updated`. + let has_deleted_fields = { + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + + let mut current_selected_field_count = 0; + for entry in current.iter_top_level_fields() { + let (key, _) = entry?; + + if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + continue; + } + current_selected_field_count += 1; + } + + current_selected_field_count != updated_selected_field_count + }; + + Ok(has_deleted_fields) + } + pub fn updated_vectors( &self, doc_alloc: &'doc Bump, diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 26ed0eb44..e2c8bb5fe 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -69,12 +69,12 @@ use std::io::BufReader; use std::{io, iter, mem}; use bumpalo::Bump; +use bumparaw_collections::bbbul::{BitPacker, BitPacker4x}; +use bumparaw_collections::map::FrozenMap; +use bumparaw_collections::{Bbbul, FrozenBbbul}; use grenad::ReaderCursor; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -use raw_collections::bbbul::{BitPacker, BitPacker4x}; -use raw_collections::map::FrozenMap; -use raw_collections::{Bbbul, FrozenBbbul}; use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; @@ -177,12 +177,12 @@ impl<'extractor> BalancedCaches<'extractor> { Ok(()) } - pub fn freeze(&mut self) -> Result>> { + pub fn freeze(&mut self, source_id: usize) -> Result>> { match &mut self.caches { InnerCaches::Normal(NormalCaches { caches }) => caches .iter_mut() .enumerate() - .map(|(bucket, map)| { + .map(|(bucket_id, map)| { // safety: we are transmuting the Bbbul into a FrozenBbbul // that are the same size. let map = unsafe { @@ -201,14 +201,19 @@ impl<'extractor> BalancedCaches<'extractor> { >, >(map) }; - Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() }) + Ok(FrozenCache { + source_id, + bucket_id, + cache: FrozenMap::new(map), + spilled: Vec::new(), + }) }) .collect(), InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. 
}) => caches .iter_mut() .zip(mem::take(spilled_entries)) .enumerate() - .map(|(bucket, (map, sorter))| { + .map(|(bucket_id, (map, sorter))| { let spilled = sorter .into_reader_cursors()? .into_iter() @@ -234,7 +239,7 @@ impl<'extractor> BalancedCaches<'extractor> { >, >(map) }; - Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled }) + Ok(FrozenCache { source_id, bucket_id, cache: FrozenMap::new(map), spilled }) }) .collect(), } @@ -415,21 +420,21 @@ fn spill_entry_to_sorter( match deladd { DelAddRoaringBitmap { del: Some(del), add: None } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: None } => return Ok(()), @@ -440,7 +445,8 @@ fn spill_entry_to_sorter( } pub struct FrozenCache<'a, 'extractor> { - bucket: usize, + bucket_id: usize, + source_id: usize, cache: FrozenMap< 'a, 'extractor, @@ -457,40 +463,36 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>( let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0); let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect(); - for thread_cache in caches { - for frozen in thread_cache.freeze()? { - bucket_caches[frozen.bucket].push(frozen); + for (thread_index, thread_cache) in caches.iter_mut().enumerate() { + for frozen in thread_cache.freeze(thread_index)? { + bucket_caches[frozen.bucket_id].push(frozen); } } Ok(bucket_caches) } -/// Merges the caches that must be all associated to the same bucket. +/// Merges the caches that must all be associated to the same bucket +/// and makes sure to sort the entries before performing the merges. /// /// # Panics /// /// - If the bucket IDs in these frozen caches are not exactly the same. -pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()> +pub fn merge_caches_sorted<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()> where F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, { let mut maps = Vec::new(); - let mut readers = Vec::new(); - let mut current_bucket = None; - for FrozenCache { bucket, cache, ref mut spilled } in frozen { - assert_eq!(*current_bucket.get_or_insert(bucket), bucket); - maps.push(cache); - readers.append(spilled); - } - - // First manage the spilled entries by looking into the HashMaps, - // merge them and mark them as dummy.
let mut heap = BinaryHeap::new(); - for (source_index, source) in readers.into_iter().enumerate() { - let mut cursor = source.into_cursor()?; - if cursor.move_on_next()?.is_some() { - heap.push(Entry { cursor, source_index }); + let mut current_bucket = None; + for FrozenCache { source_id, bucket_id, cache, spilled } in frozen { + assert_eq!(*current_bucket.get_or_insert(bucket_id), bucket_id); + maps.push((source_id, cache)); + for reader in spilled { + let mut cursor = reader.into_cursor()?; + if cursor.move_on_next()?.is_some() { + heap.push(Entry { cursor, source_id }); + } } } @@ -507,25 +509,29 @@ where let mut output = DelAddRoaringBitmap::from_bytes(first_value)?; while let Some(mut entry) = heap.peek_mut() { - if let Some((key, _value)) = entry.cursor.current() { - if first_key == key { - let new = DelAddRoaringBitmap::from_bytes(first_value)?; - output = output.merge(new); - // When we are done we the current value of this entry move make - // it move forward and let the heap reorganize itself (on drop) - if entry.cursor.move_on_next()?.is_none() { - PeekMut::pop(entry); - } - } else { + if let Some((key, value)) = entry.cursor.current() { + if first_key != key { break; } + + let new = DelAddRoaringBitmap::from_bytes(value)?; + output = output.merge(new); + // When we are done with the current value of this entry, + // move it forward and let the heap reorganize itself (on drop) + if entry.cursor.move_on_next()?.is_none() { + PeekMut::pop(entry); + } } } // Once we merged all of the spilled bitmaps we must also // fetch the entries from the non-spilled entries (the HashMaps). - for (map_index, map) in maps.iter_mut().enumerate() { - if first_entry.source_index != map_index { + for (source_id, map) in maps.iter_mut() { + debug_assert!( + !(map.get(first_key).is_some() && first_entry.source_id == *source_id), + "A thread should not have spilled a key that has been inserted in the cache" + ); + if first_entry.source_id != *source_id { if let Some(new) = map.get_mut(first_key) { output.union_and_clear_bbbul(new); } @@ -537,22 +543,22 @@ where // Don't forget to put the first entry back into the heap. if first_entry.cursor.move_on_next()?.is_some() { - heap.push(first_entry) + heap.push(first_entry); } } // Then manage the content on the HashMap entries that weren't taken (mem::take).
- while let Some(mut map) = maps.pop() { - for (key, bbbul) in map.iter_mut() { - // Make sure we don't try to work with entries already managed by the spilled - if bbbul.is_empty() { - continue; - } + while let Some((_, mut map)) = maps.pop() { + // Make sure we don't try to work with entries already managed by the spilled + let mut ordered_entries: Vec<_> = + map.iter_mut().filter(|(_, bbbul)| !bbbul.is_empty()).collect(); + ordered_entries.sort_unstable_by_key(|(key, _)| *key); + for (key, bbbul) in ordered_entries { let mut output = DelAddRoaringBitmap::empty(); output.union_and_clear_bbbul(bbbul); - for rhs in maps.iter_mut() { + for (_, rhs) in maps.iter_mut() { if let Some(new) = rhs.get_mut(key) { output.union_and_clear_bbbul(new); } @@ -568,14 +574,14 @@ where struct Entry { cursor: ReaderCursor, - source_index: usize, + source_id: usize, } impl Ord for Entry { fn cmp(&self, other: &Entry) -> Ordering { let skey = self.cursor.current().map(|(k, _)| k); let okey = other.cursor.current().map(|(k, _)| k); - skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse() + skey.cmp(&okey).then(self.source_id.cmp(&other.source_id)).reverse() } } diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index aeb1d5694..13307025a 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -12,13 +12,14 @@ use crate::update::new::thread_local::FullySend; use crate::update::new::DocumentChange; use crate::vector::EmbeddingConfigs; use crate::Result; -pub struct DocumentsExtractor<'a> { - document_sender: &'a DocumentsSender<'a>, + +pub struct DocumentsExtractor<'a, 'b> { + document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs, } -impl<'a> DocumentsExtractor<'a> { - pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self { +impl<'a, 'b> DocumentsExtractor<'a, 'b> { + pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { Self { document_sender, embedders } } } @@ -29,7 +30,7 @@ pub struct DocumentExtractorData { pub field_distribution_delta: HashMap, } -impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> { type Data = FullySend>; fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result { diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 9ad37d52c..66ed6cbfb 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -16,23 +16,23 @@ use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; use crate::update::new::extract::perm_json_p; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; -pub struct FacetedExtractorData<'a> { +pub struct 
FacetedExtractorData<'a, 'b> { attributes_to_extract: &'a [&'a str], - sender: &'a FieldIdDocidFacetSender<'a>, + sender: &'a FieldIdDocidFacetSender<'a, 'b>, grenad_parameters: GrenadParameters, buckets: usize, } -impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { type Data = RefCell>; fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { @@ -97,6 +97,15 @@ impl FacetedDocidsExtractor { }, ), DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + Some(attributes_to_extract), + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + extract_document_facets( attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, @@ -318,7 +327,7 @@ impl<'doc> DelAddFacetValue<'doc> { docid: DocumentId, sender: &FieldIdDocidFacetSender, doc_alloc: &Bump, - ) -> std::result::Result<(), crossbeam_channel::SendError<()>> { + ) -> crate::Result<()> { let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); for ((fid, value), deladd) in self.strings { if let Ok(s) = std::str::from_utf8(&value) { @@ -364,26 +373,16 @@ fn truncate_str(s: &str) -> &str { impl FacetedDocidsExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - pub fn run_extraction< - 'pl, - 'fid, - 'indexer, - 'index, - 'extractor, - DC: DocumentChanges<'pl>, - MSP, - SP, - >( + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, sender: &FieldIdDocidFacetSender, - step: Step, + step: IndexingStep, ) -> Result>> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let index = indexing_context.index; let rtxn = index.read_txn()?; diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 09d2ce0f8..a3820609d 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -1,6 +1,6 @@ use std::cell::RefCell; use std::fs::File; -use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _}; +use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek as _, Write as _}; use std::{iter, mem, result}; use bumpalo::Bump; @@ -97,30 +97,34 @@ pub struct FrozenGeoExtractorData<'extractor> { impl<'extractor> FrozenGeoExtractorData<'extractor> { pub fn iter_and_clear_removed( &mut self, - ) -> impl IntoIterator> + '_ { - mem::take(&mut self.removed) + ) -> io::Result> + '_> { + Ok(mem::take(&mut self.removed) .iter() .copied() .map(Ok) - .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)?)) } pub fn iter_and_clear_inserted( &mut self, - ) -> impl IntoIterator> + '_ { - mem::take(&mut self.inserted) + ) -> io::Result> + '_> { + Ok(mem::take(&mut self.inserted) .iter() .copied() .map(Ok) - .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)?)) } } fn iterator_over_spilled_geopoints( spilled: &mut Option>, -) -> impl IntoIterator> + '_ { +) -> io::Result> + '_> { let mut spilled = spilled.take(); - iter::from_fn(move || match &mut spilled { + if let 
Some(spilled) = &mut spilled { + spilled.rewind()?; + } + + Ok(iter::from_fn(move || match &mut spilled { Some(file) => { let geopoint_bytes = &mut [0u8; mem::size_of::()]; match file.read_exact(geopoint_bytes) { @@ -130,7 +134,7 @@ fn iterator_over_spilled_geopoints( } } None => None, - }) + })) } impl<'extractor> Extractor<'extractor> for GeoExtractor { @@ -157,7 +161,9 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { let mut data_ref = context.data.borrow_mut_or_yield(); for change in changes { - if max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) { + if data_ref.spilled_removed.is_none() + && max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) + { // We must spill as we allocated too much memory data_ref.spilled_removed = tempfile::tempfile().map(BufWriter::new).map(Some)?; data_ref.spilled_inserted = tempfile::tempfile().map(BufWriter::new).map(Some)?; diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index e67f70db1..4bcb918e4 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -6,30 +6,31 @@ mod searchable; mod vectors; use bumpalo::Bump; -pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap}; +pub use cache::{ + merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, +}; pub use documents::*; pub use faceted::*; pub use geo::*; pub use searchable::*; pub use vectors::EmbeddingExtractor; -use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress}; -use super::steps::Step; +use super::indexer::document_changes::{DocumentChanges, IndexingContext}; +use super::steps::IndexingStep; use super::thread_local::{FullySend, ThreadLocal}; use crate::update::GrenadParameters; use crate::Result; pub trait DocidsExtractor { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result>> where - MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync; + MSP: Fn() -> bool + Sync; } /// TODO move in permissive json pointer diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 05e2374dc..952ee91e4 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use 
crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -28,7 +28,7 @@ pub struct WordDocidsBalancedCaches<'extractor> { exact_word_docids: BalancedCaches<'extractor>, word_position_docids: BalancedCaches<'extractor>, fid_word_count_docids: BalancedCaches<'extractor>, - fid_word_count: HashMap, + fid_word_count: HashMap, Option)>, current_docid: Option, } @@ -85,8 +85,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { self.fid_word_count .entry(field_id) - .and_modify(|(_current_count, new_count)| *new_count += 1) - .or_insert((0, 1)); + .and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1) + .or_insert((None, Some(1))); self.current_docid = Some(docid); Ok(()) @@ -130,8 +130,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { self.fid_word_count .entry(field_id) - .and_modify(|(current_count, _new_count)| *current_count += 1) - .or_insert((1, 0)); + .and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1) + .or_insert((Some(1), None)); self.current_docid = Some(docid); @@ -141,14 +141,18 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { fn flush_fid_word_count(&mut self, buffer: &mut BumpVec) -> Result<()> { for (fid, (current_count, new_count)) in self.fid_word_count.drain() { if current_count != new_count { - if current_count <= MAX_COUNTED_WORDS { + if let Some(current_count) = + current_count.filter(|current_count| *current_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(current_count as u8); self.fid_word_count_docids .insert_del_u32(buffer, self.current_docid.unwrap())?; } - if new_count <= MAX_COUNTED_WORDS { + if let Some(new_count) = + new_count.filter(|new_count| *new_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(new_count as u8); @@ -235,25 +239,15 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { pub struct WordDocidsExtractors; impl WordDocidsExtractors { - pub fn run_extraction< - 'pl, - 'fid, - 'indexer, - 'index, - 'extractor, - DC: DocumentChanges<'pl>, - MSP, - SP, - >( + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let index = indexing_context.index; let rtxn = index.read_txn()?; @@ -351,6 +345,15 @@ impl WordDocidsExtractors { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + &context.rtxn, + context.index, + context.db_fields_ids_map, + )? 
{ + return Ok(()); + } + let mut token_fn = |fname: &str, fid, pos, word: &str| { cached_sorter.insert_del_u32( fid, diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dcd9e3a78..e58c0efd2 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -70,6 +70,15 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + document_tokenizer.attribute_to_extract, + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + let document = inner.current(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index 05d2406d9..c4240196a 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::BalancedCaches; use super::DocidsExtractor; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; @@ -56,16 +56,15 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> } pub trait SearchableExtractor: Sized + Sync { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result>> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let rtxn = indexing_context.index.read_txn()?; let stop_words = indexing_context.index.stop_words(&rtxn)?; @@ -134,16 +133,15 @@ pub trait SearchableExtractor: Sized + Sync { } impl DocidsExtractor for T { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( + fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( grenad_parameters: GrenadParameters, document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result>> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { Self::run_extraction( grenad_parameters, diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index ffdce5b7e..1c1605b66 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ 
b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -176,9 +176,10 @@ pub fn tokenizer_builder<'a>( #[cfg(test)] mod test { use bumpalo::Bump; + use bumparaw_collections::RawMap; use charabia::TokenizerBuilder; use meili_snap::snapshot; - use raw_collections::RawMap; + use rustc_hash::FxBuildHasher; use serde_json::json; use serde_json::value::RawValue; @@ -234,7 +235,7 @@ mod test { let bump = Bump::new(); let document: &RawValue = serde_json::from_str(&document).unwrap(); - let document = RawMap::from_raw_value(document, &bump).unwrap(); + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap(); let document = Versions::single(document); let document = DocumentFromVersions::new(&document); diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 8ac73a8d7..2a72a1650 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -18,17 +18,17 @@ use crate::vector::error::{ use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; -pub struct EmbeddingExtractor<'a> { +pub struct EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } -impl<'a> EmbeddingExtractor<'a> { +impl<'a, 'b> EmbeddingExtractor<'a, 'b> { pub fn new( embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { @@ -43,7 +43,7 @@ pub struct EmbeddingExtractorData<'extractor>( unsafe impl MostlySend for EmbeddingExtractorData<'_> {} -impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { type Data = RefCell>; fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { @@ -130,6 +130,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { ); } else if new_vectors.regenerate { let new_rendered = prompt.render_document( + update.external_document_id(), update.current( &context.rtxn, context.index, @@ -139,6 +140,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; let old_rendered = prompt.render_document( + update.external_document_id(), update.merged( &context.rtxn, context.index, @@ -158,6 +160,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } } else if old_vectors.regenerate { let old_rendered = prompt.render_document( + update.external_document_id(), update.current( &context.rtxn, context.index, @@ -167,6 +170,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; let new_rendered = prompt.render_document( + update.external_document_id(), update.merged( &context.rtxn, context.index, @@ -216,6 +220,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { ); } else if new_vectors.regenerate { let rendered = prompt.render_document( + insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, @@ -229,6 +234,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } } else { let rendered = prompt.render_document( 
+ insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, @@ -259,7 +265,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { // Currently this is the case as: // 1. BVec are inside of the bumaplo // 2. All other fields are either trivial (u8) or references. -struct Chunks<'a, 'extractor> { +struct Chunks<'a, 'b, 'extractor> { texts: BVec<'a, &'a str>, ids: BVec<'a, DocumentId>, @@ -270,11 +276,11 @@ struct Chunks<'a, 'extractor> { possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, } -impl<'a, 'extractor> Chunks<'a, 'extractor> { +impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { #[allow(clippy::too_many_arguments)] pub fn new( embedder: &'a Embedder, @@ -284,7 +290,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, doc_alloc: &'a Bump, ) -> Self { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); @@ -368,7 +374,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { possible_embedding_mistakes: &PossibleEmbeddingMistakes, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, - sender: &EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, ) -> Result<()> { if let Some(external_docid) = has_manual_generation { diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index 1993c1d00..d1ff6096d 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -103,6 +103,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> { #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")] pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> { + tracing::trace!("merge facet strings for facet search: {:?}", self.registered_facets); + let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?; let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString); builder.extend(reader); @@ -118,12 +120,15 @@ impl<'indexer> FacetSearchBuilder<'indexer> { BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?; if current_field_id != Some(field_id) { - if let Some(fst_merger_builder) = fst_merger_builder { + if let (Some(current_field_id), Some(fst_merger_builder)) = + (current_field_id, fst_merger_builder) + { let mmap = fst_merger_builder.build(&mut callback)?; - index - .facet_id_string_fst - .remap_data_type::() - .put(wtxn, &field_id, &mmap)?; + index.facet_id_string_fst.remap_data_type::().put( + wtxn, + ¤t_field_id, + &mmap, + )?; } fst = index.facet_id_string_fst.get(rtxn, &field_id)?; diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index c9808360e..4d9fa40a1 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -1,6 +1,8 @@ use std::ops::ControlFlow; use bumpalo::Bump; +use bumparaw_collections::RawVec; +use rustc_hash::FxBuildHasher; use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde_json::value::RawValue; @@ -360,7 +362,7 
@@ impl<'a> DeserrRawValue<'a> { } pub struct DeserrRawVec<'a> { - vec: raw_collections::RawVec<'a>, + vec: RawVec<'a>, alloc: &'a Bump, } @@ -379,7 +381,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> { } pub struct DeserrRawVecIter<'a> { - it: raw_collections::vec::iter::IntoIter<'a>, + it: bumparaw_collections::vec::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -393,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> { } pub struct DeserrRawMap<'a> { - map: raw_collections::RawMap<'a>, + map: bumparaw_collections::RawMap<'a, FxBuildHasher>, alloc: &'a Bump, } @@ -416,7 +418,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> { } pub struct DeserrRawMapIter<'a> { - it: raw_collections::map::iter::IntoIter<'a>, + it: bumparaw_collections::map::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -615,7 +617,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> { where A: serde::de::SeqAccess<'de>, { - let mut raw_vec = raw_collections::RawVec::new_in(self.alloc); + let mut raw_vec = RawVec::new_in(self.alloc); while let Some(next) = seq.next_element()? { raw_vec.push(next); } diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index bfb369680..a45fcee85 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -1,4 +1,5 @@ use std::cell::{Cell, RefCell}; +use std::sync::atomic::Ordering; use std::sync::{Arc, RwLock}; use bumpalo::Bump; @@ -7,8 +8,9 @@ use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; +use crate::progress::{AtomicDocumentStep, Progress}; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; @@ -70,7 +72,7 @@ impl< F: FnOnce(&'extractor Bump) -> Result, { let doc_alloc = - doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); + doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024)))); let doc_alloc = doc_alloc.0.take(); let fields_ids_map = fields_ids_map_store .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into()); @@ -133,10 +135,8 @@ pub struct IndexingContext< 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { pub index: &'index Index, pub db_fields_ids_map: &'indexer FieldsIdsMap, @@ -144,7 +144,7 @@ pub struct IndexingContext< pub doc_allocs: &'indexer ThreadLocal>>, pub fields_ids_map_store: &'indexer ThreadLocal>>>, pub must_stop_processing: &'indexer MSP, - pub send_progress: &'indexer SP, + pub progress: &'indexer Progress, } impl< @@ -152,18 +152,15 @@ impl< 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > Copy for IndexingContext< 'fid, // invariant lifetime of fields ids map 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { } @@ -172,18 +169,15 @@ impl< 'indexer, // 
covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > Clone for IndexingContext< 'fid, // invariant lifetime of fields ids map 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { fn clone(&self) -> Self { *self @@ -202,7 +196,6 @@ pub fn extract< EX, DC: DocumentChanges<'pl>, MSP, - SP, >( document_changes: &DC, extractor: &EX, @@ -213,18 +206,18 @@ pub fn extract< doc_allocs, fields_ids_map_store, must_stop_processing, - send_progress, - }: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + progress, + }: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, datastore: &'data ThreadLocal, - step: Step, + step: IndexingStep, ) -> Result<()> where EX: Extractor<'extractor>, MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { tracing::trace!("We are resetting the extractor allocators"); + progress.update_progress(step); // Clean up and reuse the extractor allocs for extractor_alloc in extractor_allocs.iter_mut() { tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes()); @@ -232,9 +225,11 @@ where } let total_documents = document_changes.len() as u32; + let (step, progress_step) = AtomicDocumentStep::new(total_documents); + progress.update_progress(progress_step); let pi = document_changes.iter(CHUNK_SIZE); - pi.enumerate().try_arc_for_each_try_init( + pi.try_arc_for_each_try_init( || { DocumentChangeContext::new( index, @@ -247,13 +242,10 @@ where move |index_alloc| extractor.init_data(index_alloc), ) }, - |context, (finished_documents, items)| { + |context, items| { if (must_stop_processing)() { return Err(Arc::new(InternalError::AbortedIndexation.into())); } - let finished_documents = (finished_documents * CHUNK_SIZE) as u32; - - (send_progress)(Progress::from_step_substep(step, finished_documents, total_documents)); // Clean up and reuse the document-specific allocator context.doc_alloc.reset(); @@ -264,6 +256,7 @@ where }); let res = extractor.process(changes, context).map_err(Arc::new); + step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed); // send back the doc_alloc in the pool context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc)); @@ -271,32 +264,7 @@ where res }, )?; - - (send_progress)(Progress::from_step_substep(step, total_documents, total_documents)); + step.store(total_documents, Ordering::Relaxed); Ok(()) } - -pub struct Progress { - pub finished_steps: u16, - pub total_steps: u16, - pub step_name: &'static str, - pub finished_total_substep: Option<(u32, u32)>, -} - -impl Progress { - pub fn from_step(step: Step) -> Self { - Self { - finished_steps: step.finished_steps(), - total_steps: Step::total_steps(), - step_name: step.name(), - finished_total_substep: None, - } - } - pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self { - Self { - finished_total_substep: Some((finished_substep, total_substep)), - ..Progress::from_step(step) - } - } -} diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index 518786e6f..b42a6c859 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -92,11 +92,12 @@ mod test { use 
crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::tests::TempIndex; + use crate::progress::Progress; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, Extractor, IndexingContext, }; use crate::update::new::indexer::DocumentDeletion; - use crate::update::new::steps::Step; + use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::DocumentId; @@ -164,7 +165,7 @@ mod test { doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, must_stop_processing: &(|| false), - send_progress: &(|_progress| {}), + progress: &Progress::default(), }; for _ in 0..3 { @@ -176,7 +177,7 @@ mod test { context, &mut extractor_allocs, &datastore, - Step::ExtractingDocuments, + IndexingStep::ExtractingDocuments, ) .unwrap(); diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 2a381d5d1..090c1eb8e 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -1,19 +1,23 @@ +use std::sync::atomic::Ordering; + use bumpalo::collections::CollectIn; use bumpalo::Bump; +use bumparaw_collections::RawMap; use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; -use raw_collections::RawMap; use rayon::slice::ParallelSlice; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use serde_json::Deserializer; use super::super::document_change::DocumentChange; -use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress}; +use super::document_changes::{DocumentChangeContext, DocumentChanges}; use super::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; +use crate::progress::{AtomicPayloadStep, Progress}; use crate::update::new::document::Versions; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::MostlySend; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; @@ -44,7 +48,7 @@ impl<'pl> DocumentOperation<'pl> { #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")] - pub fn into_changes( + pub fn into_changes( self, indexer: &'pl Bump, index: &Index, @@ -52,12 +56,12 @@ impl<'pl> DocumentOperation<'pl> { primary_key_from_op: Option<&'pl str>, new_fields_ids_map: &mut FieldsIdsMap, must_stop_processing: &MSP, - send_progress: &SP, + progress: Progress, ) -> Result<(DocumentOperationChanges<'pl>, Vec, Option>)> where MSP: Fn() -> bool, - SP: Fn(Progress), { + progress.update_progress(IndexingStep::PreparingPayloads); let Self { operations, method } = self; let documents_ids = index.documents_ids(rtxn)?; @@ -67,16 +71,14 @@ impl<'pl> DocumentOperation<'pl> { let mut primary_key = None; let payload_count = operations.len(); + let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32); + progress.update_progress(progress_step); for (payload_index, operation) in operations.into_iter().enumerate() { if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } - send_progress(Progress::from_step_substep( - Step::PreparingPayloads, - payload_index as u32, - payload_count as u32, - )); + step.store(payload_index as u32, Ordering::Relaxed); let mut bytes = 0; let result = match operation { @@ 
-117,12 +119,7 @@ impl<'pl> DocumentOperation<'pl> { }; operations_stats.push(PayloadStats { document_count, bytes, error }); } - - send_progress(Progress::from_step_substep( - Step::PreparingPayloads, - payload_count as u32, - payload_count as u32, - )); + step.store(payload_count as u32, Ordering::Relaxed); // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = @@ -166,8 +163,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( // Only guess the primary key if it is the first document let retrieved_primary_key = if previous_offset == 0 { - let doc = - RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?; + let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer) + .map(Some) + .map_err(UserError::SerdeJson)?; let result = retrieve_or_guess_primary_key( rtxn, @@ -545,8 +543,9 @@ impl MergeChanges for MergeDocumentForReplacement { match operations.last() { Some(InnerDocOp::Addition(DocumentOffset { content })) => { let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; if is_new { Ok(Some(DocumentChange::Insertion(Insertion::create( @@ -632,8 +631,9 @@ impl MergeChanges for MergeDocumentForUpdates { } }; let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; Some(Versions::single(document)) } @@ -647,8 +647,9 @@ impl MergeChanges for MergeDocumentForUpdates { }; let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; + let document = + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; Ok(document) }); Versions::multiple(versions)? 
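The hunks above migrate every call site from the git-only `raw-collections` crate to the published `bumparaw-collections` crate, and each `RawMap` now carries an explicit `FxBuildHasher` both as a type parameter and as a constructor argument. The following is a minimal standalone sketch of that construction pattern, not part of the patch: the sample JSON, the function shape, and the `get` lookup are illustrative, while the imports and the `from_raw_value_and_hasher(raw, FxBuildHasher, &bump)` call mirror what is visible in the diff.

use bumpalo::Bump;
use bumparaw_collections::RawMap;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;

fn main() -> Result<(), serde_json::Error> {
    // Keys and values are interned in a bump arena, as the indexer does per document.
    let bump = Bump::new();
    // `RawValue` keeps the payload as raw JSON bytes; nothing is deserialized eagerly.
    let raw: &RawValue = serde_json::from_str(r#"{ "id": 1, "title": "red shoes" }"#)?;
    // Same constructor as used throughout the patch, with the rustc-hash hasher passed explicitly.
    let map = RawMap::from_raw_value_and_hasher(raw, FxBuildHasher, &bump)?;
    // Top-level fields stay available as raw values for later, lazy extraction.
    assert!(map.get("title").is_some());
    Ok(())
}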
diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 01ac26503..a850c0d03 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,9 +1,11 @@ use std::cmp::Ordering; +use std::sync::atomic::AtomicBool; use std::sync::{OnceLock, RwLock}; use std::thread::{self, Builder}; use big_s::S; -use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; +use bumparaw_collections::RawMap; +use document_changes::{extract, DocumentChanges, IndexingContext}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; use hashbrown::HashMap; @@ -12,7 +14,7 @@ use heed::{RoTxn, RwTxn}; use itertools::{merge_join_by, EitherOrBoth}; pub use partial_dump::PartialDump; use rand::SeedableRng as _; -use raw_collections::RawMap; +use rustc_hash::FxBuildHasher; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; @@ -20,7 +22,7 @@ use super::channel::*; use super::extract::*; use super::facet_search_builder::FacetSearchBuilder; use super::merger::FacetFieldIdsDelta; -use super::steps::Step; +use super::steps::IndexingStep; use super::thread_local::ThreadLocal; use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use super::words_prefix_docids::{ @@ -31,6 +33,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; use crate::facet::FacetType; use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; +use crate::progress::Progress; use crate::proximity::ProximityPrecision; use crate::update::del_add::DelAdd; use crate::update::new::extract::EmbeddingExtractor; @@ -41,7 +44,7 @@ use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; use crate::{ - FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, + Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder, UserError, }; @@ -58,9 +61,10 @@ mod update_by_function; /// /// TODO return stats #[allow(clippy::too_many_arguments)] // clippy: 😝 -pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( +pub fn index<'pl, 'indexer, 'index, DC, MSP>( wtxn: &mut RwTxn, index: &'index Index, + pool: &ThreadPoolNoAbort, grenad_parameters: GrenadParameters, db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, @@ -68,14 +72,44 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( document_changes: &DC, embedders: EmbeddingConfigs, must_stop_processing: &'indexer MSP, - send_progress: &'indexer SP, + progress: &'indexer Progress, ) -> Result<()> where DC: DocumentChanges<'pl>, MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { - let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); + let mut bbbuffers = Vec::new(); + let finished_extraction = AtomicBool::new(false); + + // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch + // is because we still use the old indexer for the settings and it is highly impacted by the + // max memory. So we keep the changes here and will remove these changes once we use the new + // indexer to also index settings. Related to #5125 and #5141. 
+ let grenad_parameters = GrenadParameters { + max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100), + ..grenad_parameters + }; + + // We compute and remove the allocated BBQueues buffers capacity from the indexing memory. + let minimum_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB + let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( + (grenad_parameters, 2 * minimum_capacity), // 100 MiB by thread by default + |max_memory| { + // 2% of the indexing memory + let total_bbbuffer_capacity = (max_memory / 100 / 2).max(minimum_capacity); + let new_grenad_parameters = GrenadParameters { + max_memory: Some( + max_memory.saturating_sub(total_bbbuffer_capacity).max(100 * 1024 * 1024), + ), + ..grenad_parameters + }; + (new_grenad_parameters, total_bbbuffer_capacity) + }, + ); + + let (extractor_sender, mut writer_receiver) = pool + .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) + .unwrap(); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); @@ -91,244 +125,274 @@ where doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, must_stop_processing, - send_progress, + progress, }; + let mut index_embeddings = index.embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; thread::scope(|s| -> Result<()> { let indexer_span = tracing::Span::current(); let embedders = &embedders; + let finished_extraction = &finished_extraction; // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; let document_ids = &mut document_ids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); - let _entered = span.enter(); - - let rtxn = index.read_txn()?; - - // document but we need to create a function that collects and compresses documents. - let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(&document_sender, embedders); - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - - extract(document_changes, - &document_extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::ExtractingDocuments, - )?; - - for document_extractor_data in datastore { - let document_extractor_data = document_extractor_data.0.into_inner(); - for (field, delta) in document_extractor_data.field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
- *current = current.saturating_add_signed(delta); - } - document_extractor_data.docids_delta.apply_to(document_ids); - } - - field_distribution.retain(|_, v| *v != 0); - - let facet_field_ids_delta; - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); + pool.install(move || { + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); let _entered = span.enter(); - facet_field_ids_delta = merge_and_send_facet_docids( - FacetedDocidsExtractor::run_extraction( - grenad_parameters, + let rtxn = index.read_txn()?; + + // document but we need to create a function that collects and compresses documents. + let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(document_sender, embedders); + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); + let _entered = span.enter(); + extract( document_changes, + &document_extractor, indexing_context, &mut extractor_allocs, - &extractor_sender.field_id_docid_facet_sender(), - Step::ExtractingFacets - )?, - FacetDatabases::new(index), - index, - extractor_sender.facet_docids(), - )?; - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); - - - let WordDocidsCaches { - word_docids, - word_fid_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, - } = WordDocidsExtractors::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWords - )?; - - // TODO Word Docids Merger - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_docids, - index.word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, + &datastore, + IndexingStep::ExtractingDocuments, )?; } - - // Word Fid Docids Merging { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); let _entered = span.enter(); - merge_and_send_docids( - word_fid_docids, - index.word_fid_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
+ *current = current.saturating_add_signed(delta); + } + document_extractor_data.docids_delta.apply_to(document_ids); + } + + field_distribution.retain(|_, v| *v != 0); } - // Exact Word Docids Merging + let facet_field_ids_delta; + { - let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - exact_word_docids, - index.exact_word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); + let _entered = span.enter(); - // Word Position Docids Merging - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_position_docids, - index.word_position_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + FacetedDocidsExtractor::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + &extractor_sender.field_id_docid_facet_sender(), + IndexingStep::ExtractingFacets + )? + }; - // Fid Word Count Docids Merging - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); - let _entered = span.enter(); - merge_and_send_docids( - fid_word_count_docids, - index.field_id_word_count_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); + let _entered = span.enter(); - // run the proximity extraction only if the precision is by word - // this works only if the settings didn't change during this transaction. 
- let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); - if proximity_precision == ProximityPrecision::ByWord { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); - let _entered = span.enter(); - - - let caches = ::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWordProximity, - )?; - - merge_and_send_docids( - caches, - index.word_pair_proximity_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - 'vectors: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); - let _entered = span.enter(); - - let mut index_embeddings = index.embedding_configs(&rtxn)?; - if index_embeddings.is_empty() { - break 'vectors; - } - - let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); - let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; - - for config in &mut index_embeddings { - 'data: for data in datastore.iter_mut() { - let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); + facet_field_ids_delta = merge_and_send_facet_docids( + caches, + FacetDatabases::new(index), + index, + extractor_sender.facet_docids(), + )?; } } - embedding_sender.finish(index_embeddings).unwrap(); - } + { + let WordDocidsCaches { + word_docids, + word_fid_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + } = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); + let _entered = span.enter(); - 'geo: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); - let _entered = span.enter(); + WordDocidsExtractors::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + IndexingStep::ExtractingWords + )? + }; - let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? 
else { - break 'geo; - }; - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - extract( - document_changes, - &extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::WritingGeoPoints - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_docids, + index.word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - merge_and_send_rtree( - datastore, - &rtxn, - index, - extractor_sender.geo(), - &indexing_context.must_stop_processing, - )?; - } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_fid_docids, + index.word_fid_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); - let _entered = span.enter(); - (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); - } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + exact_word_docids, + index.exact_word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - Result::Ok(facet_field_ids_delta) + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_position_docids, + index.word_position_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); + let _entered = span.enter(); + merge_and_send_docids( + fid_word_count_docids, + index.field_id_word_count_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + } + + // run the proximity extraction only if the precision is by word + // this works only if the settings didn't change during this transaction. + let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::ByWord { + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + + ::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + IndexingStep::ExtractingWordProximity, + )? 
+ }; + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); + let _entered = span.enter(); + + merge_and_send_docids( + caches, + index.word_pair_proximity_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + } + + 'vectors: { + if index_embeddings.is_empty() { + break 'vectors; + } + + let embedding_sender = extractor_sender.embeddings(); + let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); + let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); + + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + IndexingStep::ExtractingEmbeddings, + )?; + } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); + let _entered = span.enter(); + + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { continue 'data; }; + deladd.apply_to(&mut config.user_provided); + } + } + } + } + + 'geo: { + let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { + break 'geo; + }; + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); + let _entered = span.enter(); + + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + IndexingStep::WritingGeoPoints + )?; + } + + merge_and_send_rtree( + datastore, + &rtxn, + index, + extractor_sender.geo(), + &indexing_context.must_stop_processing, + )?; + } + indexing_context.progress.update_progress(IndexingStep::WritingToDatabase); + finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); + + Result::Ok((facet_field_ids_delta, index_embeddings)) + }).unwrap() })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); let vector_arroy = index.vector_arroy; - let mut rng = rand::rngs::StdRng::seed_from_u64(42); let indexer_span = tracing::Span::current(); let arroy_writers: Result> = embedders .inner_as_ref() @@ -351,94 +415,116 @@ where }) .collect(); + // Used by the ArroySetVector to copy the embedding into an + // aligned memory area, required by arroy to accept a new vector. + let mut aligned_embedding = Vec::new(); let mut arroy_writers = arroy_writers?; - for operation in writer_receiver { - match operation { - WriterOperation::DbOperation(db_operation) => { - let database = db_operation.database(index); - match db_operation.entry() { - EntryOperation::Delete(e) => { - if !database.delete(wtxn, e.entry())?
{ - unreachable!("We tried to delete an unknown key") - } - } - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, - } + + { + let span = tracing::trace_span!(target: "indexing::write_db", "all"); + let _entered = span.enter(); + + let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); + let mut _entered_post_merge = None; + + while let Some(action) = writer_receiver.recv_action() { + if _entered_post_merge.is_none() + && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) + { + _entered_post_merge = Some(span.enter()); } - WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { - ArroyOperation::DeleteVectors { docid } => { - for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - writer.del_items(wtxn, dimensions, docid)?; + + match action { + ReceiverAction::WakeUp => (), + ReceiverAction::LargeEntry(LargeEntry { database, key, value }) => { + let database_name = database.database_name(); + let database = database.database(index); + if let Err(error) = database.put(wtxn, &key, &value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: bstr::BString::from(&key[..]), + value_length: value.len(), + error, + })); } } - ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings: raw_embeddings, - } => { + ReceiverAction::LargeVectors(large_vectors) => { + let LargeVectors { docid, embedder_id, .. } = large_vectors; let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - // TODO: switch to Embeddings let mut embeddings = Embeddings::new(*dimensions); - for embedding in raw_embeddings { - embeddings.append(embedding).unwrap(); + for embedding in large_vectors.read_embeddings(*dimensions) { + embeddings.push(embedding.to_vec()).unwrap(); } - writer.del_items(wtxn, *dimensions, docid)?; writer.add_items(wtxn, docid, &embeddings)?; } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, &embedding)?; - } - ArroyOperation::Finish { configs } => { - let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); - let _entered = span.enter(); + } - (indexing_context.send_progress)(Progress::from_step( - Step::WritingEmbeddingsToDatabase, - )); - - for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - writer.build_and_quantize( - wtxn, - &mut rng, - dimensions, - false, - &indexing_context.must_stop_processing, - )?; - } - - index.put_embedding_configs(wtxn, configs)?; - } - }, + // Every time there is a message in the channel we search + // for new entries in the BBQueue buffers. + write_from_bbqueue( + &mut writer_receiver, + index, + wtxn, + &arroy_writers, + &mut aligned_embedding, + )?; } + + // Once the extractor/writer channel is closed + // we must process the remaining BBQueue messages.
+ write_from_bbqueue( + &mut writer_receiver, + index, + wtxn, + &arroy_writers, + &mut aligned_embedding, + )?; } - (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); + indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors); - let facet_field_ids_delta = extractor_handle.join().unwrap()?; + let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?; - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); + 'vectors: { + let span = + tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); + let _entered = span.enter(); + + if index_embeddings.is_empty() { + break 'vectors; + } + + indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); + let mut rng = rand::rngs::StdRng::seed_from_u64(42); + for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { + let dimensions = *dimensions; + writer.build_and_quantize( + wtxn, + &mut rng, + dimensions, + false, + &indexing_context.must_stop_processing, + )?; + } + + index.put_embedding_configs(wtxn, index_embeddings)?; + } + + indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); + if index.facet_search(wtxn)? { + compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + } - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords)); - + indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?; } - (indexing_context.send_progress)(Progress::from_step(Step::Finalizing)); + indexing_context.progress.update_progress(IndexingStep::Finalizing); Ok(()) as Result<_> })?; @@ -464,6 +550,72 @@ where Ok(()) } +/// A function dedicated to managing all the available BBQueue frames. +/// +/// It reads all the available frames, does the corresponding database operations, +/// and stops when no frames are available.
+fn write_from_bbqueue( + writer_receiver: &mut WriterBbqueueReceiver<'_>, + index: &Index, + wtxn: &mut RwTxn<'_>, + arroy_writers: &HashMap, + aligned_embedding: &mut Vec, +) -> crate::Result<()> { + while let Some(frame_with_header) = writer_receiver.recv_frame() { + match frame_with_header.header() { + EntryHeader::DbOperation(operation) => { + let database_name = operation.database.database_name(); + let database = operation.database.database(index); + let frame = frame_with_header.frame(); + match operation.key_value(frame) { + (key, Some(value)) => { + if let Err(error) = database.put(wtxn, key, value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: key.into(), + value_length: value.len(), + error, + })); + } + } + (key, None) => match database.delete(wtxn, key) { + Ok(false) => { + unreachable!("We tried to delete an unknown key: {key:?}") + } + Ok(_) => (), + Err(error) => { + return Err(Error::InternalError(InternalError::StoreDeletion { + database_name, + key: key.into(), + error, + })); + } + }, + } + } + EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { + for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers { + let dimensions = *dimensions; + writer.del_items(wtxn, dimensions, docid)?; + } + } + EntryHeader::ArroySetVectors(asvs) => { + let ArroySetVectors { docid, embedder_id, .. } = asvs; + let frame = frame_with_header.frame(); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let mut embeddings = Embeddings::new(*dimensions); + let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); + embeddings.append(all_embeddings.to_vec()).unwrap(); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; + } + } + } + + Ok(()) +} + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] fn compute_prefix_database( index: &Index, @@ -618,7 +770,7 @@ pub fn retrieve_or_guess_primary_key<'a>( index: &Index, new_fields_ids_map: &mut FieldsIdsMap, primary_key_from_op: Option<&'a str>, - first_document: Option>, + first_document: Option>, ) -> Result, bool), UserError>> { // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. 
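The merge passes above all fold per-key `DelAddRoaringBitmap` values into the bitmap currently stored in the database; the exact algebra lives in `merge_cbo_bitmaps`, in the `merger.rs` hunk further down. The following is a minimal sketch of that del/add arithmetic using the `roaring` crate; the docid values are invented for illustration and the snippet is not part of the patch itself:

```rust
use roaring::RoaringBitmap;

fn main() {
    // Bitmap currently stored in the database for some key.
    let current: RoaringBitmap = [1u32, 2, 3, 4].into_iter().collect();
    // Docids removed and (re)added for that key during this indexing batch.
    let del: RoaringBitmap = [2u32, 3].into_iter().collect();
    let add: RoaringBitmap = [3u32, 5].into_iter().collect();

    // The updated code asserts this invariant before merging.
    debug_assert!(del.is_subset(&current));

    // Formula from the updated `merge_cbo_bitmaps`: a docid that is both
    // deleted and re-added in the same batch (here, 3) survives the merge.
    let output = (&current - (&del - &add)) | (&add - &del);

    let expected: RoaringBitmap = [1u32, 3, 4, 5].into_iter().collect();
    assert_eq!(output, expected);

    // An empty `output` maps to `Operation::Delete`, a non-empty one to
    // `Operation::Write`, as in the match arms of the merger.
    assert!(!output.is_empty());
}
```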
diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 2cc653813..6e4abd898 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -1,6 +1,8 @@ use std::ops::DerefMut; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::document_changes::{DocumentChangeContext, DocumentChanges}; @@ -75,7 +77,7 @@ where self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; let external_document_id = external_document_id.to_de(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) .map_err(InternalError::SerdeJson)?; let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index a8e3e38a8..3001648e6 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -1,8 +1,9 @@ -use raw_collections::RawMap; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; +use rustc_hash::FxBuildHasher; use super::document_changes::DocumentChangeContext; use super::DocumentChanges; @@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { if document_id != new_document_id { Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) } else { - let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) - .map_err(InternalError::SerdeJson)?; + let raw_new_doc = RawMap::from_raw_value_and_hasher( + raw_new_doc, + FxBuildHasher, + doc_alloc, + ) + .map_err(InternalError::SerdeJson)?; Ok(Some(DocumentChange::Update(Update::create( docid, diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 039c56b9d..9e87388a2 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -9,8 +9,8 @@ use roaring::RoaringBitmap; use super::channel::*; use super::extract::{ - merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, - GeoExtractorData, + merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, + FacetKind, GeoExtractorData, }; use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result}; @@ -19,7 +19,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>( datastore: impl IntoIterator>>, rtxn: &RoTxn, index: &Index, - geo_sender: GeoSender<'_>, + geo_sender: GeoSender<'_, '_>, must_stop_processing: &MSP, ) -> Result<()> where @@ -34,7 +34,7 @@ where } let mut frozen = data.into_inner().freeze()?; - for result in frozen.iter_and_clear_removed() { + for result in frozen.iter_and_clear_removed()? { let extracted_geo_point = result?; let removed = rtree.remove(&GeoPoint::from(extracted_geo_point)); debug_assert!(removed.is_some()); @@ -42,7 +42,7 @@ where debug_assert!(removed); } - for result in frozen.iter_and_clear_inserted() { + for result in frozen.iter_and_clear_inserted()? 
{ let extracted_geo_point = result?; rtree.insert(GeoPoint::from(extracted_geo_point)); let inserted = faceted.insert(extracted_geo_point.docid); @@ -56,38 +56,37 @@ where let rtree_mmap = unsafe { Mmap::map(&file)? }; geo_sender.set_rtree(rtree_mmap).unwrap(); - geo_sender.set_geo_faceted(&faceted).unwrap(); + geo_sender.set_geo_faceted(&faceted)?; Ok(()) } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_docids<'extractor, MSP>( +pub fn merge_and_send_docids<'extractor, MSP, D>( mut caches: Vec>, database: Database, index: &Index, - docids_sender: impl DocidsSender + Sync, + docids_sender: WordDocidsSender, must_stop_processing: &MSP, ) -> Result<()> where MSP: Fn() -> bool + Sync, + D: DatabaseType + Sync, { transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| { let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } - merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), @@ -101,26 +100,24 @@ pub fn merge_and_send_facet_docids<'extractor>( mut caches: Vec>, database: FacetDatabases, index: &Index, - docids_sender: impl DocidsSender + Sync, + docids_sender: FacetDocidsSender, ) -> Result { transpose_and_freeze_caches(&mut caches)? .into_par_iter() .map(|frozen| { let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); - merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { facet_field_ids_delta.register_from_key(key); - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), @@ -238,8 +235,12 @@ fn merge_cbo_bitmaps( (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), (Some(current), Some(del), add) => { + debug_assert!( + del.is_subset(&current), + "del is not a subset of current, which must be impossible."
+ ); let output = match add { - Some(add) => (&current - del) | add, + Some(add) => (&current - (&del - &add)) | (add - del), None => &current - del, }; if output.is_empty() { @@ -252,10 +253,3 @@ } } } - -/// TODO Return the slice directly from the serialize_into method -fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { - buffer.clear(); - CboRoaringBitmapCodec::serialize_into(bitmap, buffer); - buffer.as_slice() -} diff --git a/crates/milli/src/update/new/ref_cell_ext.rs b/crates/milli/src/update/new/ref_cell_ext.rs index c66f4af0a..77f5fa800 100644 --- a/crates/milli/src/update/new/ref_cell_ext.rs +++ b/crates/milli/src/update/new/ref_cell_ext.rs @@ -5,6 +5,7 @@ pub trait RefCellExt { &self, ) -> std::result::Result, std::cell::BorrowMutError>; + #[track_caller] fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { self.try_borrow_mut_or_yield().unwrap() } diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index 7c2441933..9eb7d376d 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -1,8 +1,12 @@ +use std::borrow::Cow; + use enum_iterator::Sequence; +use crate::progress::Step; + #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] -#[repr(u16)] -pub enum Step { +#[repr(u8)] +pub enum IndexingStep { PreparingPayloads, ExtractingDocuments, ExtractingFacets, @@ -11,37 +15,38 @@ pub enum Step { ExtractingEmbeddings, WritingGeoPoints, WritingToDatabase, - WritingEmbeddingsToDatabase, WaitingForExtractors, + WritingEmbeddingsToDatabase, PostProcessingFacets, PostProcessingWords, Finalizing, } -impl Step { - pub fn name(&self) -> &'static str { +impl Step for IndexingStep { + fn name(&self) -> Cow<'static, str> { match self { - Step::PreparingPayloads => "preparing update file", - Step::ExtractingDocuments => "extracting documents", - Step::ExtractingFacets => "extracting facets", - Step::ExtractingWords => "extracting words", - Step::ExtractingWordProximity => "extracting word proximity", - Step::ExtractingEmbeddings => "extracting embeddings", - Step::WritingGeoPoints => "writing geo points", - Step::WritingToDatabase => "writing to database", - Step::WritingEmbeddingsToDatabase => "writing embeddings to database", - Step::WaitingForExtractors => "waiting for extractors", - Step::PostProcessingFacets => "post-processing facets", - Step::PostProcessingWords => "post-processing words", - Step::Finalizing => "finalizing", + IndexingStep::PreparingPayloads => "preparing update file", + IndexingStep::ExtractingDocuments => "extracting documents", + IndexingStep::ExtractingFacets => "extracting facets", + IndexingStep::ExtractingWords => "extracting words", + IndexingStep::ExtractingWordProximity => "extracting word proximity", + IndexingStep::ExtractingEmbeddings => "extracting embeddings", + IndexingStep::WritingGeoPoints => "writing geo points", + IndexingStep::WritingToDatabase => "writing to database", + IndexingStep::WaitingForExtractors => "waiting for extractors", + IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database", + IndexingStep::PostProcessingFacets => "post-processing facets", + IndexingStep::PostProcessingWords => "post-processing words", + IndexingStep::Finalizing => "finalizing", } + .into() } - pub fn finished_steps(self) -> u16 { - self as u16 + fn current(&self) -> u32 { + *self as u32 } - pub const fn total_steps() -> u16 { - Self::CARDINALITY as u16 + fn total(&self) -> u32 { + Self::CARDINALITY as u32 } } diff --git
a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 319730db0..8d14a749d 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -1,9 +1,10 @@ use std::collections::BTreeSet; use bumpalo::Bump; +use bumparaw_collections::RawMap; use deserr::{Deserr, IntoValue}; use heed::RoTxn; -use raw_collections::RawMap; +use rustc_hash::FxBuildHasher; use serde::Serialize; use serde_json::value::RawValue; @@ -84,7 +85,7 @@ pub struct VectorDocumentFromDb<'t> { docid: DocumentId, embedding_config: Vec, index: &'t Index, - vectors_field: Option>, + vectors_field: Option>, rtxn: &'t RoTxn<'t>, doc_alloc: &'t Bump, } @@ -102,9 +103,10 @@ impl<'t> VectorDocumentFromDb<'t> { }; let vectors = document.vectors_field()?; let vectors_field = match vectors { - Some(vectors) => { - Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?) - } + Some(vectors) => Some( + RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc) + .map_err(InternalError::SerdeJson)?, + ), None => None, }; @@ -220,7 +222,7 @@ fn entry_from_raw_value( pub struct VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, - vectors: RawMap<'doc>, + vectors: RawMap<'doc, FxBuildHasher>, embedders: &'doc EmbeddingConfigs, } @@ -233,8 +235,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> { ) -> Result> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? { - let vectors = - RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; + let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump) + .map_err(UserError::SerdeJson)?; Ok(Some(Self { external_document_id, vectors, embedders })) } else { Ok(None) diff --git a/crates/milli/src/update/new/word_fst_builder.rs b/crates/milli/src/update/new/word_fst_builder.rs index 2b1c4604b..a9a5222be 100644 --- a/crates/milli/src/update/new/word_fst_builder.rs +++ b/crates/milli/src/update/new/word_fst_builder.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::io::BufWriter; use fst::{Set, SetBuilder, Streamer}; @@ -75,18 +75,18 @@ pub struct PrefixData { #[derive(Debug)] pub struct PrefixDelta { - pub modified: HashSet, - pub deleted: HashSet, + pub modified: BTreeSet, + pub deleted: BTreeSet, } struct PrefixFstBuilder { - prefix_count_threshold: u64, + prefix_count_threshold: usize, max_prefix_length: usize, /// TODO: Replace the full memory allocation prefix_fst_builders: Vec>>, current_prefix: Vec, - current_prefix_count: Vec, - modified_prefixes: HashSet, + current_prefix_count: Vec, + modified_prefixes: BTreeSet, current_prefix_is_modified: Vec, } @@ -95,7 +95,7 @@ impl PrefixFstBuilder { let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = prefix_settings; - if !compute_prefixes { + if compute_prefixes != crate::index::PrefixSearch::IndexingTime { return None; } @@ -110,7 +110,7 @@ impl PrefixFstBuilder { prefix_fst_builders, current_prefix: vec![Prefix::new(); max_prefix_length], current_prefix_count: vec![0; max_prefix_length], - modified_prefixes: HashSet::new(), + modified_prefixes: BTreeSet::new(), current_prefix_is_modified: vec![false; max_prefix_length], }) } @@ -180,7 +180,7 @@ impl PrefixFstBuilder { let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? 
}; let new_prefix_fst = Set::new(&prefix_fst_mmap)?; let old_prefix_fst = index.words_prefixes_fst(rtxn)?; - let mut deleted_prefixes = HashSet::new(); + let mut deleted_prefixes = BTreeSet::new(); { let mut deleted_prefixes_stream = old_prefix_fst.op().add(&new_prefix_fst).difference(); while let Some(prefix) = deleted_prefixes_stream.next() { diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index 338d22505..bf64049c3 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::io::{BufReader, BufWriter, Read, Seek, Write}; use hashbrown::HashMap; @@ -37,8 +37,8 @@ impl WordPrefixDocids { fn execute( self, wtxn: &mut heed::RwTxn, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, ) -> Result<()> { delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; self.recompute_modified_prefixes(wtxn, prefix_to_compute) @@ -48,7 +48,7 @@ impl WordPrefixDocids { fn recompute_modified_prefixes( &self, wtxn: &mut RwTxn, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. // And collect the CboRoaringBitmaps pointers in an HashMap. @@ -76,7 +76,7 @@ impl WordPrefixDocids { .union()?; buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); index.push(PrefixEntry { prefix, serialized_length: buffer.len() }); file.write_all(buffer) })?; @@ -127,7 +127,7 @@ impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> { pub fn from_prefixes( database: Database, rtxn: &'rtxn RoTxn, - prefixes: &'a HashSet, + prefixes: &'a BTreeSet, ) -> heed::Result { let database = database.remap_data_type::(); @@ -173,8 +173,8 @@ impl WordPrefixIntegerDocids { fn execute( self, wtxn: &mut heed::RwTxn, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, ) -> Result<()> { delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; self.recompute_modified_prefixes(wtxn, prefix_to_compute) @@ -184,7 +184,7 @@ impl WordPrefixIntegerDocids { fn recompute_modified_prefixes( &self, wtxn: &mut RwTxn, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. // And collect the CboRoaringBitmaps pointers in an HashMap. @@ -211,7 +211,7 @@ impl WordPrefixIntegerDocids { .union()?; buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); index.push(PrefixIntegerEntry { prefix, pos, serialized_length: buffer.len() }); file.write_all(buffer)?; } @@ -262,7 +262,7 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> { pub fn from_prefixes( database: Database, rtxn: &'rtxn RoTxn, - prefixes: &'a HashSet, + prefixes: &'a BTreeSet, ) -> heed::Result { let database = database.remap_data_type::(); @@ -291,7 +291,7 @@ unsafe impl<'a, 'rtxn> Sync for FrozenPrefixIntegerBitmaps<'a, 'rtxn> {} fn delete_prefixes( wtxn: &mut RwTxn, prefix_database: &Database, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We remove all the entries that are no more required in this word prefix docids database. 
for prefix in prefixes { @@ -309,8 +309,8 @@ fn delete_prefixes( pub fn compute_word_prefix_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixDocids::new( @@ -325,8 +325,8 @@ pub fn compute_word_prefix_docids( pub fn compute_exact_word_prefix_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixDocids::new( @@ -341,8 +341,8 @@ pub fn compute_exact_word_prefix_docids( pub fn compute_word_prefix_fid_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixIntegerDocids::new( @@ -357,8 +357,8 @@ pub fn compute_word_prefix_fid_docids( pub fn compute_word_prefix_position_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, grenad_parameters: GrenadParameters, ) -> Result<()> { WordPrefixIntegerDocids::new( diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index ccfdb1711..3d2702479 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -17,7 +17,8 @@ use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; use crate::index::{ - IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, + IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO, + DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; use crate::prompt::default_max_bytes; @@ -177,6 +178,8 @@ pub struct Settings<'a, 't, 'i> { embedder_settings: Setting>>, search_cutoff: Setting, localized_attributes_rules: Setting>, + prefix_search: Setting, + facet_search: Setting, } impl<'a, 't, 'i> Settings<'a, 't, 'i> { @@ -212,6 +215,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { embedder_settings: Setting::NotSet, search_cutoff: Setting::NotSet, localized_attributes_rules: Setting::NotSet, + prefix_search: Setting::NotSet, + facet_search: Setting::NotSet, indexer_config, } } @@ -418,6 +423,22 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.localized_attributes_rules = Setting::Reset; } + pub fn set_prefix_search(&mut self, value: PrefixSearch) { + self.prefix_search = Setting::Set(value); + } + + pub fn reset_prefix_search(&mut self) { + self.prefix_search = Setting::Reset; + } + + pub fn set_facet_search(&mut self, value: bool) { + self.facet_search = Setting::Set(value); + } + + pub fn reset_facet_search(&mut self) { + self.facet_search = Setting::Reset; + } + #[tracing::instrument( level = "trace" skip(self, progress_callback, should_abort, settings_diff), @@ -944,7 +965,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { false } else { self.index.put_proximity_precision(self.wtxn, new)?; - true + old.is_some() || new != ProximityPrecision::default() } } Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?, @@ -954,6 +975,42 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } + fn update_prefix_search(&mut self) -> Result { + let changed = match self.prefix_search { + Setting::Set(new) => { + let old = self.index.prefix_search(self.wtxn)?; + 
if old == Some(new) { + false + } else { + self.index.put_prefix_search(self.wtxn, new)?; + old.is_some() || new != PrefixSearch::default() + } + } + Setting::Reset => self.index.delete_prefix_search(self.wtxn)?, + Setting::NotSet => false, + }; + + Ok(changed) + } + + fn update_facet_search(&mut self) -> Result { + let changed = match self.facet_search { + Setting::Set(new) => { + let old = self.index.facet_search(self.wtxn)?; + if old == new { + false + } else { + self.index.put_facet_search(self.wtxn, new)?; + true + } + } + Setting::Reset => self.index.delete_facet_search(self.wtxn)?, + Setting::NotSet => false, + }; + + Ok(changed) + } + fn update_embedding_configs(&mut self) -> Result> { match std::mem::take(&mut self.embedder_settings) { Setting::Set(configs) => self.update_embedding_configs_set(configs), @@ -1203,6 +1260,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_searchable()?; self.update_exact_attributes()?; self.update_proximity_precision()?; + self.update_prefix_search()?; + self.update_facet_search()?; self.update_localized_attributes_rules()?; let embedding_config_updates = self.update_embedding_configs()?; @@ -1282,6 +1341,7 @@ impl InnerIndexSettingsDiff { || old_settings.allowed_separators != new_settings.allowed_separators || old_settings.dictionary != new_settings.dictionary || old_settings.proximity_precision != new_settings.proximity_precision + || old_settings.prefix_search != new_settings.prefix_search || old_settings.localized_searchable_fields_ids != new_settings.localized_searchable_fields_ids }; @@ -1372,7 +1432,7 @@ impl InnerIndexSettingsDiff { } } - pub fn reindex_facets(&self) -> bool { + pub fn facet_fids_changed(&self) -> bool { let existing_fields = &self.new.existing_fields; if existing_fields.iter().any(|field| field.contains('.')) { return true; @@ -1392,7 +1452,15 @@ impl InnerIndexSettingsDiff { } (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) - || self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + } + + pub fn global_facet_settings_changed(&self) -> bool { + self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + || self.old.facet_search != self.new.facet_search + } + + pub fn reindex_facets(&self) -> bool { + self.facet_fids_changed() || self.global_facet_settings_changed() } pub fn reindex_vectors(&self) -> bool { @@ -1432,6 +1500,8 @@ pub(crate) struct InnerIndexSettings { pub non_faceted_fields_ids: Vec, pub localized_searchable_fields_ids: LocalizedFieldIds, pub localized_faceted_fields_ids: LocalizedFieldIds, + pub prefix_search: PrefixSearch, + pub facet_search: bool, } impl InnerIndexSettings { @@ -1457,6 +1527,8 @@ impl InnerIndexSettings { Some(embedding_configs) => embedding_configs, None => embedders(index.embedding_configs(rtxn)?)?, }; + let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); + let facet_search = index.facet_search(rtxn)?; let existing_fields: HashSet<_> = index .field_distribution(rtxn)? 
.into_iter() @@ -1514,6 +1586,8 @@ impl InnerIndexSettings { non_faceted_fields_ids: vectors_fids.clone(), localized_searchable_fields_ids, localized_faceted_fields_ids, + prefix_search, + facet_search, }) } @@ -2721,6 +2795,8 @@ mod tests { embedder_settings, search_cutoff, localized_attributes_rules, + prefix_search, + facet_search, } = settings; assert!(matches!(searchable_fields, Setting::NotSet)); assert!(matches!(displayed_fields, Setting::NotSet)); @@ -2746,6 +2822,8 @@ mod tests { assert!(matches!(embedder_settings, Setting::NotSet)); assert!(matches!(search_cutoff, Setting::NotSet)); assert!(matches!(localized_attributes_rules, Setting::NotSet)); + assert!(matches!(prefix_search, Setting::NotSet)); + assert!(matches!(facet_search, Setting::NotSet)); }) .unwrap(); } diff --git a/crates/milli/src/update/words_prefixes_fst.rs b/crates/milli/src/update/words_prefixes_fst.rs index d47d6d14c..d18bfa74c 100644 --- a/crates/milli/src/update/words_prefixes_fst.rs +++ b/crates/milli/src/update/words_prefixes_fst.rs @@ -9,7 +9,7 @@ use crate::{Index, Result, SmallString32}; pub struct WordsPrefixesFst<'t, 'i> { wtxn: &'t mut RwTxn<'i>, index: &'i Index, - threshold: u32, + threshold: usize, max_prefix_length: usize, } @@ -24,8 +24,8 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { /// /// Default value is 100. This value must be higher than 50 and will be clamped /// to this bound otherwise. - pub fn threshold(&mut self, value: u32) -> &mut Self { - self.threshold = value.max(50); + pub fn threshold(&mut self, value: usize) -> &mut Self { + self.threshold = value; self } @@ -34,7 +34,7 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped /// to these bounds, otherwise. pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value.clamp(1, 25); + self.max_prefix_length = value; self } diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 3047e6dfc..a1d71ef93 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -475,7 +475,7 @@ impl Embeddings { Ok(()) } - /// Append a flat vector of embeddings a the end of the embeddings. + /// Append a flat vector of embeddings at the end of the embeddings. /// /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. 
pub fn append(&mut self, mut embeddings: Vec) -> Result<(), Vec> { diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 61d0697ff..ced81409d 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -3,6 +3,7 @@ use bumpalo::Bump; use heed::EnvOpenOptions; use maplit::hashset; use milli::documents::mmap_from_objects; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; @@ -57,13 +58,14 @@ fn test_facet_distribution_with_no_facet_values() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -71,7 +73,7 @@ fn test_facet_distribution_with_no_facet_values() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 1287b59d5..30690969b 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -7,6 +7,7 @@ use bumpalo::Bump; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{btreemap, hashset}; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; @@ -90,7 +91,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -101,6 +102,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -108,7 +110,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index 3e56eeff0..304059915 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -5,6 +5,7 @@ use bumpalo::Bump; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; +use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; @@ -326,13 +327,14 @@ fn criteria_ascdesc() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -340,7 +342,7 @@ fn criteria_ascdesc() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index 7ac9a1e4b..d33d79e54 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -3,6 +3,7 @@ use std::collections::BTreeSet; use bumpalo::Bump; use heed::EnvOpenOptions; use milli::documents::mmap_from_objects; 
+use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; @@ -135,13 +136,14 @@ fn test_typo_disabled_on_word() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -149,7 +151,7 @@ fn test_typo_disabled_on_word() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/xtask/src/bench/meili_process.rs b/crates/xtask/src/bench/meili_process.rs index 99f6f4ea6..2aff679fc 100644 --- a/crates/xtask/src/bench/meili_process.rs +++ b/crates/xtask/src/bench/meili_process.rs @@ -1,23 +1,56 @@ use std::collections::BTreeMap; +use std::time::Duration; use anyhow::{bail, Context as _}; +use tokio::process::Command; +use tokio::time; use super::assets::Asset; use super::client::Client; use super::workload::Workload; pub async fn kill(mut meilisearch: tokio::process::Child) { - if let Err(error) = meilisearch.kill().await { - tracing::warn!( - error = &error as &dyn std::error::Error, - "while terminating Meilisearch server" - ) + let Some(id) = meilisearch.id() else { return }; + + match Command::new("kill").args(["--signal=TERM", &id.to_string()]).spawn() { + Ok(mut cmd) => { + let Err(error) = cmd.wait().await else { return }; + tracing::warn!( + error = &error as &dyn std::error::Error, + "while awaiting the Meilisearch server kill" + ); + } + Err(error) => { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server with a kill -s TERM" + ); + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } + return; + } + }; + + match time::timeout(Duration::from_secs(5), meilisearch.wait()).await { + Ok(_) => (), + Err(_) => { + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } + } } } #[tracing::instrument] pub async fn build() -> anyhow::Result<()> { - let mut command = tokio::process::Command::new("cargo"); + let mut command = Command::new("cargo"); command.arg("build").arg("--release").arg("-p").arg("meilisearch"); command.kill_on_drop(true); @@ -37,17 +70,8 @@ pub async fn start( master_key: Option<&str>, workload: &Workload, asset_folder: &str, + mut command: Command, ) -> anyhow::Result { - let mut command = tokio::process::Command::new("cargo"); - command - .arg("run") - .arg("--release") - .arg("-p") - .arg("meilisearch") - .arg("--bin") - .arg("meilisearch") - .arg("--"); - command.arg("--db-path").arg("./_xtask_benchmark.ms"); if let Some(master_key) = master_key { command.arg("--master-key").arg(master_key); @@ -86,7 +110,7 @@ async fn wait_for_health( return Ok(()); } - tokio::time::sleep(std::time::Duration::from_millis(500)).await; + time::sleep(Duration::from_millis(500)).await; // check whether the Meilisearch instance exited early (cut the wait) if let Some(exit_code) = meilisearch.try_wait().context("cannot check Meilisearch server process status")? 
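The rewritten `kill` above replaces the previous unconditional `Child::kill` with an escalation: ask the server to stop with SIGTERM, wait a bounded amount of time for it to exit, and only then force-kill it, presumably so a benchmarked Meilisearch can shut down cleanly instead of being interrupted mid-write. A condensed, self-contained sketch of that same pattern; the `terminate_gracefully` name is invented for this sketch, and the real function also logs every failure with `tracing::warn!`:

```rust
use std::time::Duration;

use tokio::process::{Child, Command};
use tokio::time;

async fn terminate_gracefully(mut meilisearch: Child) {
    // No PID means the process has already been reaped; nothing to signal.
    let Some(id) = meilisearch.id() else { return };

    // First, ask politely with SIGTERM so the server can shut down cleanly.
    if let Ok(mut term) = Command::new("kill").args(["--signal=TERM", &id.to_string()]).spawn() {
        let _ = term.wait().await;
    }

    // Then wait a bounded amount of time; if the server is still alive,
    // fall back to the hard kill that used to be the only behaviour.
    if time::timeout(Duration::from_secs(5), meilisearch.wait()).await.is_err() {
        let _ = meilisearch.kill().await;
    }
}
```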
diff --git a/crates/xtask/src/bench/mod.rs b/crates/xtask/src/bench/mod.rs index fdb2c4963..1416c21d9 100644 --- a/crates/xtask/src/bench/mod.rs +++ b/crates/xtask/src/bench/mod.rs @@ -82,6 +82,16 @@ pub struct BenchDeriveArgs { /// Reason for the benchmark invocation #[arg(short, long)] reason: Option, + + /// The maximum time in seconds we allow for fetching the task queue before timing out. + #[arg(long, default_value_t = 60)] + tasks_queue_timeout_secs: u64, + + /// The path to the binary to run. + /// + /// If unspecified, runs `cargo run` after building Meilisearch with `cargo build`. + #[arg(long)] + binary_path: Option, } pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { @@ -127,7 +137,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let meili_client = Client::new( Some("http://127.0.0.1:7700".into()), args.master_key.as_deref(), - Some(std::time::Duration::from_secs(60)), + Some(std::time::Duration::from_secs(args.tasks_queue_timeout_secs)), )?; // enter runtime @@ -135,7 +145,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { rt.block_on(async { dashboard_client.send_machine_info(&env).await?; - let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); + let commit_message = build_info.commit_msg.unwrap_or_default().split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); let invocation_uuid = dashboard_client.create_invocation(build_info.clone(), commit_message, env, max_workloads, reason).await?; @@ -166,6 +176,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { args.master_key.as_deref(), workload, &args, + args.binary_path.as_deref(), ) .await?; diff --git a/crates/xtask/src/bench/workload.rs b/crates/xtask/src/bench/workload.rs index 19c8bfae8..39119428f 100644 --- a/crates/xtask/src/bench/workload.rs +++ b/crates/xtask/src/bench/workload.rs @@ -1,6 +1,7 @@ use std::collections::BTreeMap; use std::fs::File; use std::io::{Seek as _, Write as _}; +use std::path::Path; use anyhow::{bail, Context as _}; use futures_util::TryStreamExt as _; @@ -85,13 +86,13 @@ pub async fn execute( master_key: Option<&str>, workload: Workload, args: &BenchDeriveArgs, + binary_path: Option<&Path>, ) -> anyhow::Result<()> { assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; let workload_uuid = dashboard_client.create_workload(invocation_uuid, &workload).await?; let mut tasks = Vec::new(); - for i in 0..workload.run_count { tasks.push( execute_run( @@ -102,6 +103,7 @@ pub async fn execute( master_key, &workload, args, + binary_path, i, ) .await?, @@ -109,7 +111,6 @@ pub async fn execute( } let mut reports = Vec::with_capacity(workload.run_count as usize); - for task in tasks { reports.push( task.await @@ -133,13 +134,31 @@ async fn execute_run( master_key: Option<&str>, workload: &Workload, args: &BenchDeriveArgs, + binary_path: Option<&Path>, run_number: u16, ) -> anyhow::Result>> { meili_process::delete_db(); - meili_process::build().await?; + let run_command = match binary_path { + Some(binary_path) => tokio::process::Command::new(binary_path), + None => { + meili_process::build().await?; + let mut command = tokio::process::Command::new("cargo"); + command + .arg("run") + .arg("--release") + .arg("-p") + .arg("meilisearch") + .arg("--bin") + .arg("meilisearch") + .arg("--"); + command + } + }; + let meilisearch = - meili_process::start(meili_client, master_key, workload, &args.asset_folder).await?; + 
meili_process::start(meili_client, master_key, workload, &args.asset_folder, run_command) + .await?; let processor = run_commands( dashboard_client, diff --git a/crates/xtask/src/main.rs b/crates/xtask/src/main.rs index b81424666..942362f4f 100644 --- a/crates/xtask/src/main.rs +++ b/crates/xtask/src/main.rs @@ -16,6 +16,7 @@ struct ListFeaturesDeriveArgs { #[command(author, version, about, long_about)] #[command(name = "cargo xtask")] #[command(bin_name = "cargo xtask")] +#[allow(clippy::large_enum_variant)] // please, that's enough... enum Command { ListFeatures(ListFeaturesDeriveArgs), Bench(BenchDeriveArgs), diff --git a/workloads/hackernews-add-new-documents.json b/workloads/hackernews-add-new-documents.json new file mode 100644 index 000000000..0470a0792 --- /dev/null +++ b/workloads/hackernews-add-new-documents.json @@ -0,0 +1,105 @@ +{ + "name": "hackernews.add_new_documents", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-05.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} diff --git a/workloads/hackernews-modify-facet-numbers.json 
b/workloads/hackernews-modify-facet-numbers.json new file mode 100644 index 000000000..f4171442f --- /dev/null +++ b/workloads/hackernews-modify-facet-numbers.json @@ -0,0 +1,111 @@ +{ + "name": "hackernews.modify_facet_numbers", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-02-modified-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", + "sha256": "7272cbfd41110d32d7fe168424a0000f07589bfe40f664652b34f4f20aaf3802" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02-modified-filters.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} + \ No newline at end of file diff --git a/workloads/hackernews-modify-facet-strings.json b/workloads/hackernews-modify-facet-strings.json new file mode 100644 index 000000000..7c5eb2e70 --- /dev/null +++ b/workloads/hackernews-modify-facet-strings.json @@ -0,0 +1,111 @@ +{ + "name": "hackernews.modify_facet_strings", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + 
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-01-modified-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson", + "sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01-modified-filters.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} + \ No newline at end of file diff --git a/workloads/hackernews-modify-searchables.json b/workloads/hackernews-modify-searchables.json new file mode 100644 index 000000000..248026f19 --- /dev/null +++ b/workloads/hackernews-modify-searchables.json @@ -0,0 +1,123 @@ +{ + "name": "hackernews.modify_searchables", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": 
"https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-01-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-searchables.ndjson", + "sha256": "e5c08710c6af70031ac7212e0ba242c72ef29c8d4e1fce66c789544641452a7c" + }, + "hackernews-02-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-searchables.ndjson", + "sha256": "098b029851117087b1e26ccb7ac408eda9bba54c3008213a2880d6fab607346e" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01-modified-searchables.ndjson" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02-modified-searchables.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +}