From e02d0064bd620b02e9d00772189c9fd772aa8cfa Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Sep 2023 11:49:27 +0200 Subject: [PATCH 1/2] Add a test case scenario --- milli/src/update/index_documents/mod.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 849e84035..3704faf44 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2519,6 +2519,25 @@ mod tests { db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); } + /// Index multiple different number of vectors in documents. + /// Vectors must be of the same length. + #[test] + fn test_multiple_vectors() { + let index = TempIndex::new(); + + index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap(); + index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap(); + index + .add_documents( + documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]), + ) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap(); + assert_eq!(res.documents_ids.len(), 3); + } + #[test] fn reproduce_the_bug() { /* From 679c0b0f970c111b3c86309354bb82ce6be47e3a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Sep 2023 12:20:25 +0200 Subject: [PATCH 2/2] Extract the vectors from the non-flattened version of the documents --- .../src/update/index_documents/extract/mod.rs | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 6259c7272..6a3d6d972 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -55,7 +55,13 @@ pub(crate) fn data_from_obkv_documents( original_obkv_chunks .par_bridge() .map(|original_documents_chunk| { - send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone()) + send_original_documents_data( + original_documents_chunk, + indexer, + lmdb_writer_sx.clone(), + vectors_field_id, + primary_key_id, + ) }) .collect::>()?; @@ -72,7 +78,6 @@ pub(crate) fn data_from_obkv_documents( &faceted_fields, primary_key_id, geo_fields_ids, - vectors_field_id, &stop_words, max_positions_per_attributes, ) @@ -257,11 +262,33 @@ fn spawn_extraction_task( /// - documents fn send_original_documents_data( original_documents_chunk: Result>, + indexer: GrenadParameters, lmdb_writer_sx: Sender>, + vectors_field_id: Option, + primary_key_id: FieldId, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; + if let Some(vectors_field_id) = vectors_field_id { + let documents_chunk_cloned = original_documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let result = extract_vector_points( + documents_chunk_cloned, + indexer, + primary_key_id, + vectors_field_id, + ); + let _ = match result { + Ok(vector_points) => { + lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) + } + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; + }); + } + // TODO: create a custom internal error lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); Ok(()) @@ -283,7 +310,6 @@ fn send_and_extract_flattened_documents_data( faceted_fields: &HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - vectors_field_id: Option, stop_words: &Option>, max_positions_per_attributes: Option, ) -> Result<( @@ -312,25 +338,6 @@ fn send_and_extract_flattened_documents_data( }); } - if let Some(vectors_field_id) = vectors_field_id { - let documents_chunk_cloned = flattened_documents_chunk.clone(); - let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); - rayon::spawn(move || { - let result = extract_vector_points( - documents_chunk_cloned, - indexer, - primary_key_id, - vectors_field_id, - ); - let _ = match result { - Ok(vector_points) => { - lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) - } - Err(error) => lmdb_writer_sx_cloned.send(Err(error)), - }; - }); - } - let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || {