Merge #4039

4039: Fix multiple vectors dimensions r=ManyTheFish a=Kerollmops This PR fixes #4035, making it possible to provide multiple vectors in a document. The fix extracts the vectors from the non-flattened version of the documents. Co-authored-by: Kerollmops <clement@meilisearch.com>
2024-11-25 19:45:05 +08:00 · 2023-09-07 09:25:58 +00:00 · 2023-09-07 09:25:58 +00:00 · 256cf33bca
commit 256cf33bca
parent 9945cbf9db 679c0b0f97
2 changed files with 48 additions and 22 deletions
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@ -55,7 +55,13 @@ pub(crate) fn data_from_obkv_documents(
    original_obkv_chunks
        .par_bridge()
        .map(|original_documents_chunk| {
-            send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
+            send_original_documents_data(
                original_documents_chunk,
                indexer,
                lmdb_writer_sx.clone(),
                vectors_field_id,
                primary_key_id,
            )
        })
        .collect::<Result<()>>()?;
@ -72,7 +78,6 @@ pub(crate) fn data_from_obkv_documents(
                    &faceted_fields,
                    primary_key_id,
                    geo_fields_ids,
                    vectors_field_id,
                    &stop_words,
                    max_positions_per_attributes,
                )
@ -257,11 +262,33 @@ fn spawn_extraction_task<FE, FS, M>(
 /// - documents
 fn send_original_documents_data(
    original_documents_chunk: Result<grenad::Reader<File>>,
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    vectors_field_id: Option<FieldId>,
    primary_key_id: FieldId,
 ) -> Result<()> {
    let original_documents_chunk =
        original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
    if let Some(vectors_field_id) = vectors_field_id {
        let documents_chunk_cloned = original_documents_chunk.clone();
        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
        rayon::spawn(move || {
            let result = extract_vector_points(
                documents_chunk_cloned,
                indexer,
                primary_key_id,
                vectors_field_id,
            );
            let _ = match result {
                Ok(vector_points) => {
                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
                }
                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
            };
        });
    }
    // TODO: create a custom internal error
    lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
    Ok(())
@ -283,7 +310,6 @@ fn send_and_extract_flattened_documents_data(
    faceted_fields: &HashSet<FieldId>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
    vectors_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@ -312,25 +338,6 @@ fn send_and_extract_flattened_documents_data(
        });
    }
    if let Some(vectors_field_id) = vectors_field_id {
        let documents_chunk_cloned = flattened_documents_chunk.clone();
        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
        rayon::spawn(move || {
            let result = extract_vector_points(
                documents_chunk_cloned,
                indexer,
                primary_key_id,
                vectors_field_id,
            );
            let _ = match result {
                Ok(vector_points) => {
                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
                }
                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
            };
        });
    }
    let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -2519,6 +2519,25 @@ mod tests {
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }
    /// Index documents that each carry a different number of vectors: a list
    /// of two, a single bare vector, and a list of three.
    /// Every vector must have the same length.
    #[test]
    fn test_multiple_vectors() {
        let index = TempIndex::new();

        // Document 0: two vectors nested in an outer array.
        let two_vectors = documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }]);
        index.add_documents(two_vectors).unwrap();

        // Document 1: one vector given directly, without the outer array.
        let single_vector = documents!([{"id": 1, "_vectors": [6, 7, 8] }]);
        index.add_documents(single_vector).unwrap();

        // Document 2: three vectors nested in an outer array.
        let three_vectors =
            documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]);
        index.add_documents(three_vectors).unwrap();

        // A vector search must return all three indexed documents.
        let expected_hits = 3;
        let read_txn = index.read_txn().unwrap();
        let search_result = index.search(&read_txn).vector([0.0, 1.0, 2.0]).execute().unwrap();
        assert_eq!(search_result.documents_ids.len(), expected_hits);
    }
    #[test]
    fn reproduce_the_bug() {
        /*