Add raw versions of parsed vectors

This commit is contained in:
Louis Dureuil 2024-10-21 10:39:05 +02:00
parent 1a3f4e719d
commit aff8ca4397
No known key found for this signature in database

View File

@ -2,6 +2,7 @@ use std::collections::{BTreeMap, BTreeSet};
use deserr::{take_cf_content, DeserializeError, Deserr, Sequence}; use deserr::{take_cf_content, DeserializeError, Deserr, Sequence};
use obkv::KvReader; use obkv::KvReader;
use serde_json::value::RawValue;
use serde_json::{from_slice, Value}; use serde_json::{from_slice, Value};
use super::Embedding; use super::Embedding;
@ -11,6 +12,13 @@ use crate::{DocumentId, FieldId, InternalError, UserError};
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
/// Vectors of a document kept in their raw, not-yet-parsed JSON form.
///
/// The enum is `untagged`: serde tries the variants in declaration order and keeps the first
/// one that deserializes successfully, so `Explicit` takes precedence over
/// `ImplicitlyUserProvided`.
///
/// NOTE(review): serde deserializes untagged enums through a buffered intermediate
/// representation, which is reported not to support `serde_json`'s `RawValue` — confirm that
/// deserializing this type actually works as intended.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(untagged)]
pub enum RawVectors<'doc> {
/// A value matching [`RawExplicitVectors`]: a `regenerate` flag and, optionally, raw embeddings.
Explicit(#[serde(borrow)] RawExplicitVectors<'doc>),
/// Any value that did not match `Explicit`, borrowed verbatim as the embeddings.
/// `must_regenerate` reports `false` for this variant.
ImplicitlyUserProvided(#[serde(borrow)] &'doc RawValue),
}
#[derive(serde::Serialize, Debug)] #[derive(serde::Serialize, Debug)]
#[serde(untagged)] #[serde(untagged)]
pub enum Vectors { pub enum Vectors {
@ -69,6 +77,22 @@ impl Vectors {
} }
} }
impl<'doc> RawVectors<'doc> {
    /// Returns the `regenerate` flag of explicit vectors.
    ///
    /// Implicitly user-provided vectors carry no such flag and always yield `false`.
    pub fn must_regenerate(&self) -> bool {
        // Both bound fields are `Copy`, so matching on the dereferenced value is enough.
        match *self {
            RawVectors::Explicit(RawExplicitVectors { regenerate, .. }) => regenerate,
            RawVectors::ImplicitlyUserProvided(_) => false,
        }
    }

    /// Returns the raw embeddings borrowed from the source document, if there are any.
    ///
    /// Explicit vectors may come without embeddings, in which case `None` is returned;
    /// implicitly user-provided vectors always yield `Some`.
    pub fn embeddings(&self) -> Option<&'doc RawValue> {
        match *self {
            RawVectors::Explicit(RawExplicitVectors { embeddings, .. }) => embeddings,
            RawVectors::ImplicitlyUserProvided(raw) => Some(raw),
        }
    }
}
#[derive(serde::Serialize, Deserr, Debug)] #[derive(serde::Serialize, Deserr, Debug)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct ExplicitVectors { pub struct ExplicitVectors {
@ -78,6 +102,15 @@ pub struct ExplicitVectors {
pub regenerate: bool, pub regenerate: bool,
} }
/// Explicit form of a document's vectors, with the embeddings kept as raw JSON.
///
/// Field names are (de)serialized in camelCase.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct RawExplicitVectors<'doc> {
// Borrowed from the input rather than copied; `default` makes a missing key deserialize
// to `None` instead of being an error.
#[serde(borrow)]
#[serde(default)]
pub embeddings: Option<&'doc RawValue>,
// Has no `default` attribute, so the key is required when deserializing.
// Returned by `RawVectors::must_regenerate` for the `Explicit` variant.
pub regenerate: bool,
}
pub enum VectorState { pub enum VectorState {
Inline(Vectors), Inline(Vectors),
Manual, Manual,