From 4b598fa648944a5f5f1cdd7ecbdadd1cb8d3d659 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Sep 2024 13:12:01 +0200 Subject: [PATCH 1/2] update arroy --- Cargo.lock | 5 +++-- index-scheduler/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- milli/src/error.rs | 1 + milli/src/vector/mod.rs | 33 +++++++++++++++++++++------------ 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3237d4e16..c85a59952 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -386,8 +386,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arroy" -version = "0.4.0" -source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e" dependencies = [ "bytemuck", "byteorder", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 432a86382..e80311005 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,7 @@ ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } [dev-dependencies] -arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } +arroy = "0.5.0" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.39.0", features = ["json", "redactions"] } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 01384f496..df0e59496 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -80,7 +80,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", tiktoken-rs = "0.5.9" liquid = "0.26.6" rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } -arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } +arroy = "0.5.0" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.10.0", features = ["json"] } diff --git a/milli/src/error.rs b/milli/src/error.rs index 400d3d3be..840db7606 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -297,6 +297,7 @@ impl From for Error { arroy::Error::InvalidVecDimension { expected, received } => { Error::UserError(UserError::InvalidVectorDimensions { expected, found: received }) } + arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation), arroy::Error::DatabaseFull | arroy::Error::InvalidItemAppend | arroy::Error::UnmatchingDistance { .. } diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index b6d6510af..097e93ad2 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::sync::Arc; -use arroy::distances::{Angular, BinaryQuantizedAngular}; +use arroy::distances::{BinaryQuantizedCosine, Cosine}; use arroy::ItemId; use deserr::{DeserializeError, Deserr}; use heed::{RoTxn, RwTxn, Unspecified}; @@ -87,7 +87,7 @@ impl ArroyWrapper { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.need_build(wtxn)? { - writer.build(wtxn, rng, None)? + writer.builder(rng).build(wtxn)? } else if writer.is_empty(wtxn)? { break; } @@ -99,11 +99,10 @@ impl ArroyWrapper { // only happens once in the life of an embedder, it's not very performances // sensitive. if quantizing && !self.quantized { - let writer = - writer.prepare_changing_distance::(wtxn)?; - writer.build(wtxn, rng, None)? + let writer = writer.prepare_changing_distance::(wtxn)?; + writer.builder(rng).build(wtxn)?; } else if writer.need_build(wtxn)? { - writer.build(wtxn, rng, None)? + writer.builder(rng).build(wtxn)?; } else if writer.is_empty(wtxn)? { break; } @@ -323,8 +322,13 @@ impl ArroyWrapper { let mut results = Vec::new(); for reader in self.readers(rtxn, db) { - let ret = reader?.nns_by_item(rtxn, item, limit, None, None, filter)?; - if let Some(mut ret) = ret { + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + searcher.candidates(filter); + } + + if let Some(mut ret) = searcher.by_item(rtxn, item)? { results.append(&mut ret); } else { break; @@ -359,8 +363,13 @@ impl ArroyWrapper { let mut results = Vec::new(); for reader in self.readers(rtxn, db) { - let mut ret = reader?.nns_by_vector(rtxn, vector, limit, None, None, filter)?; - results.append(&mut ret); + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + searcher.candidates(filter); + } + + results.append(&mut searcher.by_vector(rtxn, vector)?); } results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); @@ -391,11 +400,11 @@ impl ArroyWrapper { Ok(vectors) } - fn angular_db(&self) -> arroy::Database { + fn angular_db(&self) -> arroy::Database { self.database.remap_data_type() } - fn quantized_db(&self) -> arroy::Database { + fn quantized_db(&self) -> arroy::Database { self.database.remap_data_type() } } From b1dc10e771a757826fe400280c8bac84976ce95b Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 1 Oct 2024 17:45:49 +0200 Subject: [PATCH 2/2] uses the new cancellation method in arroy --- milli/src/update/index_documents/mod.rs | 3 ++- milli/src/vector/mod.rs | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e164a0817..88d20fff0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -699,6 +699,7 @@ where for (embedder_name, dimension) in dimension { let wtxn = &mut *self.wtxn; let vector_arroy = self.index.vector_arroy; + let cancel = &self.should_abort; let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, @@ -713,7 +714,7 @@ where pool.install(|| { let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); - writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing)?; + writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?; Result::Ok(()) }) .map_err(InternalError::from)??; diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 097e93ad2..571c02c8c 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -82,6 +82,7 @@ impl ArroyWrapper { rng: &mut R, dimension: usize, quantizing: bool, + cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), arroy::Error> { for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { @@ -100,9 +101,9 @@ impl ArroyWrapper { // sensitive. if quantizing && !self.quantized { let writer = writer.prepare_changing_distance::(wtxn)?; - writer.builder(rng).build(wtxn)?; + writer.builder(rng).cancel(cancel).build(wtxn)?; } else if writer.need_build(wtxn)? { - writer.builder(rng).build(wtxn)?; + writer.builder(rng).cancel(cancel).build(wtxn)?; } else if writer.is_empty(wtxn)? { break; }