From 645a55317af91f37d68d26527568032016bf5393 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 14:54:24 +0200 Subject: [PATCH] merge the build and quantize method --- milli/src/update/index_documents/mod.rs | 5 +-- milli/src/vector/mod.rs | 43 ++++++++++++++----------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b03ab259a..e164a0817 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -713,10 +713,7 @@ where pool.install(|| { let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); - if is_quantizing { - writer.quantize(wtxn, dimension)?; - } - writer.build(wtxn, &mut rng, dimension)?; + writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing)?; Result::Ok(()) }) .map_err(InternalError::from)??; diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 8341ab923..a33f76559 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -98,18 +98,37 @@ impl ArroyWrapper { Ok(false) } - /// TODO: We should early exit when it doesn't need to be built - pub fn build( - &self, + pub fn build_and_quantize( + &mut self, wtxn: &mut RwTxn, rng: &mut R, dimension: usize, + quantizing: bool, ) -> Result<(), arroy::Error> { for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension).build(wtxn, rng, None)? + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.need_build(wtxn)? { + writer.build(wtxn, rng, None)? + } else if writer.is_empty(wtxn)? { + break; + } } else { - arroy::Writer::new(self.angular_db(), index, dimension).build(wtxn, rng, None)? 
+ let writer = arroy::Writer::new(self.angular_db(), index, dimension); + // If we are quantizing the databases, we can't know from meilisearch + // if the db was empty but still contained the wrong metadata, thus we need + // to quantize everything and can't stop early. Since this operation can + // only happen once in the life of an embedder, it's not very performance + // sensitive. + if quantizing && !self.quantized { + let writer = + writer.prepare_changing_distance::(wtxn)?; + writer.build(wtxn, rng, None)? + } else if writer.need_build(wtxn)? { + writer.build(wtxn, rng, None)? + } else if writer.is_empty(wtxn)? { + break; + } } } Ok(()) @@ -266,20 +285,6 @@ impl ArroyWrapper { Ok(()) } - pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result { - for index in arroy_db_range_for_embedder(self.embedder_index) { - let empty = if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension).is_empty(rtxn)? - } else { - arroy::Writer::new(self.angular_db(), index, dimension).is_empty(rtxn)? - }; - if !empty { - return Ok(false); - } - } - Ok(true) - } - pub fn contains_item( &self, rtxn: &RoTxn,