diff --git a/Cargo.lock b/Cargo.lock index a407244b1..ed6d0c291 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -380,6 +380,24 @@ dependencies = [ "derive_arbitrary", ] +[[package]] +name = "arroy" +version = "0.1.0" +source = "git+https://github.com/meilisearch/arroy.git#4b59476f457e5443ff250ea10d40d8b66a692674" +dependencies = [ + "bytemuck", + "byteorder", + "heed", + "log", + "memmap2 0.9.0", + "ordered-float 4.2.0", + "rand", + "rayon", + "roaring", + "tempfile", + "thiserror", +] + [[package]] name = "assert-json-diff" version = "2.0.2" @@ -537,9 +555,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.3.3" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" dependencies = [ "serde", ] @@ -629,9 +647,9 @@ dependencies = [ [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" @@ -687,7 +705,7 @@ dependencies = [ "byteorder", "gemm", "half 2.3.1", - "memmap2", + "memmap2 0.7.1", "num-traits", "num_cpus", "rand", @@ -1561,23 +1579,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.2" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ - "errno-dragonfly", - "libc", - "windows-sys 0.48.0", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -2117,7 +2124,7 @@ version = "0.20.0-alpha.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934" dependencies = [ - "bitflags 2.3.3", + "bitflags 2.4.1", "bytemuck", "byteorder", "heed-traits", @@ -2868,7 +2875,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c619cdaa30bb84088963968bee12a45ea5fbbf355f2c021bcd15589f5ca494a" dependencies = [ "num_cpus", - "ordered-float", + "ordered-float 3.7.0", "parking_lot", "rand", "rayon", @@ -2911,7 +2918,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", - "rustix 0.38.7", + "rustix 0.38.26", "windows-sys 0.48.0", ] @@ -3294,9 +3301,9 @@ checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" [[package]] name = "linux-raw-sys" -version = "0.4.5" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" [[package]] name = "liquid" @@ -3412,9 +3419,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.19" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "logging_timer" @@ -3543,7 +3550,7 @@ dependencies = [ "num_cpus", "obkv", "once_cell", - "ordered-float", + "ordered-float 3.7.0", "parking_lot", "permissive-json-pointer", "pin-project-lite", @@ 
-3618,7 +3625,7 @@ dependencies = [ "fst", "insta", "meili-snap", - "memmap2", + "memmap2 0.7.1", "milli", "roaring", "serde", @@ -3662,6 +3669,15 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "memmap2" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deaba38d7abf1d4cca21cc89e932e542ba2b9258664d2a9ef0e61512039c9375" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.9.0" @@ -3675,6 +3691,7 @@ dependencies = [ name = "milli" version = "1.5.1" dependencies = [ + "arroy", "big_s", "bimap", "bincode", @@ -3711,12 +3728,12 @@ dependencies = [ "maplit", "md5", "meili-snap", - "memmap2", + "memmap2 0.7.1", "mimalloc", "nolife", "obkv", "once_cell", - "ordered-float", + "ordered-float 3.7.0", "puffin", "rand", "rand_pcg", @@ -3983,7 +4000,7 @@ version = "0.10.59" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a257ad03cd8fb16ad4172fedf8094451e1af1c4b70097636ef2eac9a5f0cc33" dependencies = [ - "bitflags 2.3.3", + "bitflags 2.4.1", "cfg-if", "foreign-types", "libc", @@ -4036,6 +4053,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ordered-float" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" +dependencies = [ + "num-traits", +] + [[package]] name = "page_size" version = "0.5.0" @@ -4553,6 +4579,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "redox_users" version = "0.4.3" @@ -4736,15 +4771,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.7" +version = "0.38.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"172891ebdceb05aa0005f533a6cbfca599ddd7d966f6f5d4d9b2e70478e70399" +checksum = "9470c4bf8246c8daf25f9598dca807fb6510347b1e1cfa55749113850c79d88a" dependencies = [ - "bitflags 2.3.3", + "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.5", - "windows-sys 0.48.0", + "linux-raw-sys 0.4.12", + "windows-sys 0.52.0", ] [[package]] @@ -5279,14 +5314,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.7.1" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", - "rustix 0.38.7", + "redox_syscall 0.4.1", + "rustix 0.38.26", "windows-sys 0.48.0", ] @@ -5997,6 +6032,15 @@ dependencies = [ "windows-targets 0.48.1", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + [[package]] name = "windows-targets" version = "0.42.2" @@ -6027,6 +6071,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.2" @@ -6039,6 +6098,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = 
"windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + [[package]] name = "windows_aarch64_msvc" version = "0.42.2" @@ -6051,6 +6116,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + [[package]] name = "windows_i686_gnu" version = "0.42.2" @@ -6063,6 +6134,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + [[package]] name = "windows_i686_msvc" version = "0.42.2" @@ -6075,6 +6152,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + [[package]] name = "windows_x86_64_gnu" version = "0.42.2" @@ -6087,6 +6170,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.2" @@ -6099,6 +6188,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + [[package]] name = "windows_x86_64_msvc" version = "0.42.2" @@ -6111,6 +6206,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + [[package]] name = "winnow" version = "0.5.4" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 38931ca0f..0aee03b2f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -89,6 +89,8 @@ reqwest = { version = "0.11.16", features = [ ], default-features = false } tiktoken-rs = "0.5.7" liquid = "0.26.4" +arroy = { git = "https://github.com/meilisearch/arroy.git", version = "0.1.0" } +rand = "0.8.5" [dev-dependencies] mimalloc = { version = "0.1.37", default-features = false } @@ -100,15 +102,7 @@ meili-snap = { path = "../meili-snap" } rand = { version = "0.8.5", features = ["small_rng"] } [features] -all-tokenizations = [ - "charabia/chinese", - "charabia/hebrew", - "charabia/japanese", - "charabia/thai", - "charabia/korean", - "charabia/greek", - "charabia/khmer", -] +all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"] # Use POSIX semaphores instead of SysV semaphores in LMDB # For more 
information on this feature, see heed's Cargo.toml diff --git a/milli/src/index.rs b/milli/src/index.rs index 307d87906..c494f2f2b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -22,7 +22,6 @@ use crate::heed_codec::{ BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, }; use crate::proximity::ProximityPrecision; -use crate::readable_slices::ReadableSlices; use crate::vector::EmbeddingConfig; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, @@ -49,10 +48,6 @@ pub mod main_key { pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; pub const GEO_RTREE_KEY: &str = "geo-rtree"; - /// The prefix of the key that is used to store the, potential big, HNSW structure. - /// It is concatenated with a big-endian encoded number (non-human readable). - /// e.g. vector-hnsw0x0032. - pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; @@ -75,6 +70,7 @@ pub mod main_key { pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by"; pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; pub const PROXIMITY_PRECISION: &str = "proximity-precision"; + pub const VECTOR_UNAVAILABLE_VECTOR_IDS: &str = "vector-unavailable-vector-ids"; pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; } @@ -102,6 +98,9 @@ pub mod db_name { pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const VECTOR_ID_DOCID: &str = "vector-id-docids"; + pub const VECTOR_DOCID_IDS: &str = "vector-docid-ids"; + pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id"; + pub const VECTOR_ARROY: &str = 
"vector-arroy"; pub const DOCUMENTS: &str = "documents"; pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids"; } @@ -168,8 +167,16 @@ pub struct Index { /// Maps the document id, the facet field id and the strings. pub field_id_docid_facet_strings: Database, - /// Maps a vector id to the document id that have it. + /// Maps a vector id to its document id. pub vector_id_docid: Database, + /// Maps a doc id to its vector ids. + pub docid_vector_ids: Database, + + /// Maps an embedder name to its id in the arroy store. + pub embedder_category_id: Database, + + /// Vector store based on arroy™. + pub vector_arroy: arroy::Database, /// Maps the document id to the document as an obkv store. pub(crate) documents: Database, @@ -184,7 +191,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(24); + options.max_dbs(27); let env = options.open(path)?; let mut wtxn = env.write_txn()?; @@ -224,7 +231,13 @@ impl Index { env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?; + // vector stuff let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?; + let docid_vector_ids = env.create_database(&mut wtxn, Some(VECTOR_DOCID_IDS))?; + let embedder_category_id = + env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?; + let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?; + let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?; wtxn.commit()?; @@ -255,6 +268,9 @@ impl Index { field_id_docid_facet_f64s, field_id_docid_facet_strings, vector_id_docid, + vector_arroy, + docid_vector_ids, + embedder_category_id, documents, }) } @@ -477,63 +493,6 @@ impl Index { None => Ok(RoaringBitmap::new()), } } - - /* vector HNSW */ - - /// Writes the provided `hnsw`. 
- pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> { - // We must delete all the chunks before we write the new HNSW chunks. - self.delete_vector_hnsw(wtxn)?; - - let chunk_size = 1024 * 1024 * (1024 + 512); // 1.5 GiB - let bytes = bincode::serialize(hnsw).map_err(Into::into).map_err(heed::Error::Encoding)?; - for (i, chunk) in bytes.chunks(chunk_size).enumerate() { - let i = i as u32; - let mut key = main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes().to_vec(); - key.extend_from_slice(&i.to_be_bytes()); - self.main.remap_types::().put(wtxn, &key, chunk)?; - } - Ok(()) - } - - /// Delete the `hnsw`. - pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result { - let mut iter = self - .main - .remap_types::() - .prefix_iter_mut(wtxn, main_key::VECTOR_HNSW_KEY_PREFIX.as_bytes())?; - let mut deleted = false; - while iter.next().transpose()?.is_some() { - // We do not keep a reference to the key or the value. - unsafe { deleted |= iter.del_current()? }; - } - Ok(deleted) - } - - /// Returns the `hnsw`. - pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result> { - let mut slices = Vec::new(); - for result in self - .main - .remap_types::() - .prefix_iter(rtxn, main_key::VECTOR_HNSW_KEY_PREFIX)? - { - let (_, slice) = result?; - slices.push(slice); - } - - if slices.is_empty() { - Ok(None) - } else { - let readable_slices: ReadableSlices<_> = slices.into_iter().collect(); - Ok(Some( - bincode::deserialize_from(readable_slices) - .map_err(Into::into) - .map_err(heed::Error::Decoding)?, - )) - } - } - /* field distribution */ /// Writes the field distribution which associates every field name with @@ -1557,6 +1516,30 @@ impl Index { .get(rtxn, main_key::EMBEDDING_CONFIGS)? 
.unwrap_or_default()) } + + pub(crate) fn put_unavailable_vector_ids( + &self, + wtxn: &mut RwTxn<'_>, + unavailable_vector_ids: RoaringBitmap, + ) -> heed::Result<()> { + self.main.remap_types::().put( + wtxn, + main_key::VECTOR_UNAVAILABLE_VECTOR_IDS, + &unavailable_vector_ids, + ) + } + + pub(crate) fn delete_unavailable_vector_ids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS) + } + + pub fn unavailable_vector_ids(&self, rtxn: &RoTxn<'_>) -> Result { + Ok(self + .main + .remap_types::() + .get(rtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)? + .unwrap_or_default()) + } } #[cfg(test)] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index b3c15e205..b865747e0 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -19,7 +19,6 @@ pub mod heed_codec; pub mod index; pub mod prompt; pub mod proximity; -mod readable_slices; pub mod score_details; mod search; pub mod update; diff --git a/milli/src/readable_slices.rs b/milli/src/readable_slices.rs deleted file mode 100644 index 7f5be214f..000000000 --- a/milli/src/readable_slices.rs +++ /dev/null @@ -1,85 +0,0 @@ -use std::io::{self, Read}; -use std::iter::FromIterator; - -pub struct ReadableSlices { - inner: Vec, - pos: u64, -} - -impl FromIterator for ReadableSlices { - fn from_iter>(iter: T) -> Self { - ReadableSlices { inner: iter.into_iter().collect(), pos: 0 } - } -} - -impl> Read for ReadableSlices { - fn read(&mut self, mut buf: &mut [u8]) -> io::Result { - let original_buf_len = buf.len(); - - // We explore the list of slices to find the one where we must start reading. - let mut pos = self.pos; - let index = match self - .inner - .iter() - .map(|s| s.as_ref().len() as u64) - .position(|size| pos.checked_sub(size).map(|p| pos = p).is_none()) - { - Some(index) => index, - None => return Ok(0), - }; - - let mut inner_pos = pos as usize; - for slice in &self.inner[index..] 
{ - let slice = &slice.as_ref()[inner_pos..]; - - if buf.len() > slice.len() { - // We must exhaust the current slice and go to the next one there is not enough here. - buf[..slice.len()].copy_from_slice(slice); - buf = &mut buf[slice.len()..]; - inner_pos = 0; - } else { - // There is enough in this slice to fill the remaining bytes of the buffer. - // Let's break just after filling it. - buf.copy_from_slice(&slice[..buf.len()]); - buf = &mut []; - break; - } - } - - let written = original_buf_len - buf.len(); - self.pos += written as u64; - Ok(written) - } -} - -#[cfg(test)] -mod test { - use std::io::Read; - - use super::ReadableSlices; - - #[test] - fn basic() { - let data: Vec<_> = (0..100).collect(); - let splits: Vec<_> = data.chunks(3).collect(); - let mut rdslices: ReadableSlices<_> = splits.into_iter().collect(); - - let mut output = Vec::new(); - let length = rdslices.read_to_end(&mut output).unwrap(); - assert_eq!(length, data.len()); - assert_eq!(output, data); - } - - #[test] - fn small_reads() { - let data: Vec<_> = (0..u8::MAX).collect(); - let splits: Vec<_> = data.chunks(27).collect(); - let mut rdslices: ReadableSlices<_> = splits.into_iter().collect(); - - let buffer = &mut [0; 45]; - let length = rdslices.read(buffer).unwrap(); - let expected: Vec<_> = (0..buffer.len() as u8).collect(); - assert_eq!(length, buffer.len()); - assert_eq!(buffer, &expected[..]); - } -} diff --git a/milli/src/search/new/vector_sort.rs b/milli/src/search/new/vector_sort.rs index 831ed45cd..59b7a72c2 100644 --- a/milli/src/search/new/vector_sort.rs +++ b/milli/src/search/new/vector_sort.rs @@ -11,64 +11,31 @@ use crate::index::Hnsw; use crate::score_details::{self, ScoreDetails}; use crate::{Result, SearchContext, SearchLogger, UserError}; -pub struct VectorSort { +pub struct VectorSort<'ctx, Q: RankingRuleQueryTrait> { query: Option, target: Vec, vector_candidates: RoaringBitmap, - scope: nolife::DynBoxScope, + reader: arroy::Reader<'ctx, 
arroy::distances::DotProduct>, + limit: usize, } -type Item<'a> = instant_distance::Item<'a, NDotProductPoint>; -type SearchFut = Pin>>; - -struct SearchFamily; -impl<'a> nolife::Family<'a> for SearchFamily { - type Family = Box> + 'a>; -} - -async fn search_scope( - mut time_capsule: nolife::TimeCapsule, - hnsw: Hnsw, - target: Vec, -) -> nolife::Never { - let mut search = instant_distance::Search::default(); - let it = Box::new(hnsw.search(&NDotProductPoint::new(target), &mut search)); - let mut it: Box> = it; - loop { - time_capsule.freeze(&mut it).await; - } -} - -impl VectorSort { +impl<'ctx, Q: RankingRuleQueryTrait> VectorSort<'ctx, Q> { pub fn new( - ctx: &SearchContext, + ctx: &'ctx SearchContext, target: Vec, vector_candidates: RoaringBitmap, + limit: usize, ) -> Result { - let hnsw = - ctx.index.vector_hnsw(ctx.txn)?.unwrap_or(Hnsw::builder().build_hnsw(Vec::default()).0); - - if let Some(expected_size) = hnsw.iter().map(|(_, point)| point.len()).next() { - if target.len() != expected_size { - return Err(UserError::InvalidVectorDimensions { - expected: expected_size, - found: target.len(), - } - .into()); - } - } + /// FIXME? 
what to do in case of missing metadata + let reader = arroy::Reader::open(ctx.txn, 0, ctx.index.vector_arroy)?; let target_clone = target.clone(); - let producer = move |time_capsule| -> SearchFut { - Box::pin(search_scope(time_capsule, hnsw, target_clone)) - }; - let scope = DynBoxScope::new(producer); - Ok(Self { query: None, target, vector_candidates, scope }) + Ok(Self { query: None, target, vector_candidates, reader, limit }) } } -impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort { +impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q> { fn id(&self) -> String { "vector_sort".to_owned() } @@ -108,11 +75,11 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort { }), })); } - - let scope = &mut self.scope; let target = &self.target; let vector_candidates = &self.vector_candidates; + let result = self.reader.nns_by_vector(ctx.txn, &target, count, search_k, candidates) + scope.enter(|it| { for item in it.by_ref() { let item: Item = item; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 59adda3e8..3b1a6c5d8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -43,6 +43,9 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { field_id_docid_facet_f64s, field_id_docid_facet_strings, vector_id_docid, + vector_arroy, + docid_vector_ids, + embedder_category_id: _, documents, } = self.index; @@ -58,7 +61,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; - self.index.delete_vector_hnsw(self.wtxn)?; // Clear the other databases. 
external_documents_ids.clear(self.wtxn)?; @@ -82,7 +84,11 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; + // vector + vector_arroy.clear(self.wtxn)?; vector_id_docid.clear(self.wtxn)?; + docid_vector_ids.clear(self.wtxn)?; + documents.clear(self.wtxn)?; Ok(number_of_documents) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 40b0dcd61..06bc8b609 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -312,7 +312,8 @@ fn send_original_documents_data( lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints { remove_vectors, embeddings, - expected_dimension, + /// FIXME: compute an expected dimension from the manual vectors if any + expected_dimension: expected_dimension.unwrap(), manual_vectors, })) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 76848b628..eaac26dd3 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -15,6 +15,7 @@ use crossbeam_channel::{Receiver, Sender}; use heed::types::Str; use heed::Database; use log::debug; +use rand::SeedableRng; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; @@ -489,6 +490,9 @@ where } } + let writer = arroy::Writer::prepare(self.wtxn, self.index.vector_arroy, 0, 0)?; + writer.build(self.wtxn, &mut rand::rngs::StdRng::from_entropy(), None)?; + // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 36d230d00..bc82518ca 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ 
-1,4 +1,4 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -27,6 +27,7 @@ use crate::index::Hnsw; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; +use crate::update::{available_documents_ids, AvailableDocumentsIds}; use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError}; pub(crate) enum TypedChunk { @@ -50,7 +51,7 @@ pub(crate) enum TypedChunk { VectorPoints { remove_vectors: grenad::Reader>, embeddings: Option>>, - expected_dimension: Option, + expected_dimension: usize, manual_vectors: grenad::Reader>, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), @@ -106,7 +107,7 @@ impl TypedChunk { format!("GeoPoints {{ number_of_entries: {} }}", grenad.len()) } TypedChunk::VectorPoints{ remove_vectors, manual_vectors, embeddings, expected_dimension } => { - format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension.unwrap_or_default()) + format!("VectorPoints {{ remove_vectors: {}, manual_vectors: {}, embeddings: {}, dimension: {} }}", remove_vectors.len(), manual_vectors.len(), embeddings.as_ref().map(|e| e.len()).unwrap_or_default(), expected_dimension) } TypedChunk::ScriptLanguageDocids(sl_map) => { format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) @@ -373,46 +374,53 @@ pub(crate) fn write_typed_chunk_into_index( return Ok((RoaringBitmap::new(), is_merged_database)); } - let mut docid_vectors_map: HashMap>>> = - HashMap::new(); - - // We extract and store the previous vectors - if let Some(hnsw) = index.vector_hnsw(wtxn)? 
{ - for (pid, point) in hnsw.iter() { - let pid_key = pid.into_inner(); - let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap(); - let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect(); - docid_vectors_map.entry(docid).or_default().insert(vector); - } - } + let mut unavailable_vector_ids = index.unavailable_vector_ids(&wtxn)?; + /// FIXME: allow customizing distance + /// FIXME: allow customizing index + let writer = arroy::Writer::prepare(wtxn, index.vector_arroy, 0, expected_dimension)?; // remove vectors for docids we want them removed let mut cursor = remove_vectors.into_cursor()?; while let Some((key, _)) = cursor.move_on_next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - docid_vectors_map.remove(&docid); + let Some(to_remove_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)? else { + continue; + }; + unavailable_vector_ids -= to_remove_vector_ids; + + for item in to_remove_vector_ids { + writer.del_item(wtxn, item)?; + } } + let mut available_vector_ids = + AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids); // add generated embeddings - if let Some((embeddings, expected_dimension)) = embeddings.zip(expected_dimension) { + if let Some(embeddings) = embeddings { let mut cursor = embeddings.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - let data: Vec> = - pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); + let data = pod_collect_to_vec(value); // it is a code error to have embeddings and not expected_dimension let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension) // code error if we somehow got the wrong dimension .unwrap(); - let mut set = HashSet::new(); + let mut new_vector_ids = RoaringBitmap::new(); for embedding in embeddings.iter() { - set.insert(embedding.to_vec()); - } + /// FIXME: error when you get over 9000 + let next_vector_id = available_vector_ids.next().unwrap(); + unavailable_vector_ids.insert(next_vector_id); - docid_vectors_map.insert(docid, set); + new_vector_ids.insert(next_vector_id); + + index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?; + + writer.add_item(wtxn, next_vector_id, embedding)?; + } + index.docid_vector_ids.put(wtxn, &docid, &new_vector_ids)?; } } @@ -425,68 +433,44 @@ pub(crate) fn write_typed_chunk_into_index( let vector_deladd_obkv = KvReaderDelAdd::new(value); if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { - // convert the vector back to a Vec - let vector: Vec> = - pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); - docid_vectors_map.entry(docid).and_modify(|v| { - if !v.remove(&vector) { - error!("Unable to delete the vector: {:?}", vector); + let vector = pod_collect_to_vec(value); + let Some(mut docid_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)? 
+ else { + error!("Unable to delete the vector: {:?}", vector); + continue; + }; + for item in docid_vector_ids { + /// FIXME: comparing the vectors by equality is inefficient, and dangerous by perfect equality + let candidate = writer.item_vector(&wtxn, item)?.expect("Inconsistent dbs"); + if candidate == vector { + writer.del_item(wtxn, item)?; + unavailable_vector_ids.remove(item); + index.vector_id_docid.delete(wtxn, &item)?; + docid_vector_ids.remove(item); + break; } - }); - } - if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { - // convert the vector back to a Vec - let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); - docid_vectors_map.entry(docid).and_modify(|v| { - v.insert(vector); - }); - } - } - - // Extract the most common vector dimension - let expected_dimension_size = { - let mut dims = HashMap::new(); - docid_vectors_map - .values() - .flat_map(|v| v.iter()) - .for_each(|v| *dims.entry(v.len()).or_insert(0) += 1); - dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len) - }; - - // Ensure that the vector lengths are correct and - // prepare the vectors before inserting them in the HNSW. 
- let mut points = Vec::new(); - let mut docids = Vec::new(); - for (docid, vector) in docid_vectors_map - .into_iter() - .flat_map(|(docid, vectors)| std::iter::repeat(docid).zip(vectors)) - { - if expected_dimension_size.map_or(false, |expected| expected != vector.len()) { - return Err(UserError::InvalidVectorDimensions { - expected: expected_dimension_size.unwrap_or(vector.len()), - found: vector.len(), } - .into()); - } else { - let vector = vector.into_iter().map(OrderedFloat::into_inner).collect(); - points.push(NDotProductPoint::new(vector)); - docids.push(docid); + index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?; + } + let mut available_vector_ids = + AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids); + + if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { + let vector = pod_collect_to_vec(value); + let next_vector_id = available_vector_ids.next().unwrap(); + + writer.add_item(wtxn, next_vector_id, &vector)?; + unavailable_vector_ids.insert(next_vector_id); + index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?; + let mut docid_vector_ids = + index.docid_vector_ids.get(&wtxn, &docid)?.unwrap_or_default(); + docid_vector_ids.insert(next_vector_id); + index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?; } } - let hnsw_length = points.len(); - let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); - - assert_eq!(docids.len(), pids.len()); - - // Store the vectors in the point-docid relation database - index.vector_id_docid.clear(wtxn)?; - for (docid, pid) in docids.into_iter().zip(pids) { - index.vector_id_docid.put(wtxn, &pid.into_inner(), &docid)?; - } - - log::debug!("There are {} entries in the HNSW so far", hnsw_length); - index.put_vector_hnsw(wtxn, &new_hnsw)?; + log::debug!("There are {} entries in the arroy so far", unavailable_vector_ids.len()); + index.put_unavailable_vector_ids(wtxn, unavailable_vector_ids)?; } TypedChunk::ScriptLanguageDocids(sl_map) => { for (key, (deletion, addition)) 
in sl_map {