From 4b64c33aa2525a8fc79e7a318ec2566c867e5f66 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Tue, 31 Oct 2023 17:44:42 +0100
Subject: [PATCH] update vector extractor

---
 .../extract/extract_vector_points.rs | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index 863bc07c3..9aed862ab 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -7,7 +7,8 @@ use serde_json::{from_slice, Value};
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::UserError;
-use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
+use crate::update::index_documents::helpers::try_split_at;
+use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
 
 /// Extracts the embedding vector contained in each document under the `_vectors` field.
 ///
@@ -28,15 +29,17 @@ pub fn extract_vector_points(
     );
 
     let mut cursor = obkv_documents.into_cursor()?;
-    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
+    while let Some((key, value)) = cursor.move_on_next()? {
+        // this must always be serialized as (docid, external_docid);
+        let (docid_bytes, external_id_bytes) =
+            try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
+        debug_assert!(std::str::from_utf8(external_id_bytes).is_ok());
+
         let obkv = obkv::KvReader::new(value);
 
         // since we only needs the primary key when we throw an error we create this getter to
         // lazily get it when needed
-        let document_id = || -> Value {
-            let document_id = obkv.get(primary_key_id).unwrap();
-            from_slice(document_id).unwrap()
-        };
+        let document_id = || -> Value { std::str::from_utf8(external_id_bytes).unwrap().into() };
 
         // first we retrieve the _vectors field
         if let Some(vectors) = obkv.get(vectors_fid) {