mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-29 08:35:15 +08:00
Merge #4039
4039: Fix multiple vectors dimensions r=ManyTheFish a=Kerollmops This PR fixes #4035, making it possible to provide multiple vectors in documents. The fix extracts the vectors from the non-flattened version of the documents. Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
commit
256cf33bca
@ -55,7 +55,13 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
original_obkv_chunks
|
original_obkv_chunks
|
||||||
.par_bridge()
|
.par_bridge()
|
||||||
.map(|original_documents_chunk| {
|
.map(|original_documents_chunk| {
|
||||||
send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
|
send_original_documents_data(
|
||||||
|
original_documents_chunk,
|
||||||
|
indexer,
|
||||||
|
lmdb_writer_sx.clone(),
|
||||||
|
vectors_field_id,
|
||||||
|
primary_key_id,
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.collect::<Result<()>>()?;
|
.collect::<Result<()>>()?;
|
||||||
|
|
||||||
@ -72,7 +78,6 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
&faceted_fields,
|
&faceted_fields,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
vectors_field_id,
|
|
||||||
&stop_words,
|
&stop_words,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
)
|
)
|
||||||
@ -257,11 +262,33 @@ fn spawn_extraction_task<FE, FS, M>(
|
|||||||
/// - documents
|
/// - documents
|
||||||
fn send_original_documents_data(
|
fn send_original_documents_data(
|
||||||
original_documents_chunk: Result<grenad::Reader<File>>,
|
original_documents_chunk: Result<grenad::Reader<File>>,
|
||||||
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
|
vectors_field_id: Option<FieldId>,
|
||||||
|
primary_key_id: FieldId,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let original_documents_chunk =
|
let original_documents_chunk =
|
||||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||||
|
|
||||||
|
if let Some(vectors_field_id) = vectors_field_id {
|
||||||
|
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||||
|
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||||
|
rayon::spawn(move || {
|
||||||
|
let result = extract_vector_points(
|
||||||
|
documents_chunk_cloned,
|
||||||
|
indexer,
|
||||||
|
primary_key_id,
|
||||||
|
vectors_field_id,
|
||||||
|
);
|
||||||
|
let _ = match result {
|
||||||
|
Ok(vector_points) => {
|
||||||
|
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
||||||
|
}
|
||||||
|
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: create a custom internal error
|
// TODO: create a custom internal error
|
||||||
lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
|
lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -283,7 +310,6 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
faceted_fields: &HashSet<FieldId>,
|
faceted_fields: &HashSet<FieldId>,
|
||||||
primary_key_id: FieldId,
|
primary_key_id: FieldId,
|
||||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||||
vectors_field_id: Option<FieldId>,
|
|
||||||
stop_words: &Option<fst::Set<&[u8]>>,
|
stop_words: &Option<fst::Set<&[u8]>>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(
|
) -> Result<(
|
||||||
@ -312,25 +338,6 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(vectors_field_id) = vectors_field_id {
|
|
||||||
let documents_chunk_cloned = flattened_documents_chunk.clone();
|
|
||||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
|
||||||
rayon::spawn(move || {
|
|
||||||
let result = extract_vector_points(
|
|
||||||
documents_chunk_cloned,
|
|
||||||
indexer,
|
|
||||||
primary_key_id,
|
|
||||||
vectors_field_id,
|
|
||||||
);
|
|
||||||
let _ = match result {
|
|
||||||
Ok(vector_points) => {
|
|
||||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
|
||||||
}
|
|
||||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
|
||||||
};
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||||
rayon::join(
|
rayon::join(
|
||||||
|| {
|
|| {
|
||||||
|
@ -2519,6 +2519,25 @@ mod tests {
|
|||||||
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Index documents that each contain a different number of vectors.
|
||||||
|
/// Vectors must be of the same length.
|
||||||
|
#[test]
|
||||||
|
fn test_multiple_vectors() {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
|
||||||
|
index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
|
||||||
|
index
|
||||||
|
.add_documents(
|
||||||
|
documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
|
||||||
|
assert_eq!(res.documents_ids.len(), 3);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn reproduce_the_bug() {
|
fn reproduce_the_bug() {
|
||||||
/*
|
/*
|
||||||
|
Loading…
Reference in New Issue
Block a user