Don't remove embedders that are not in the config from the document DB

This commit is contained in:
Louis Dureuil 2024-05-22 12:24:51 +02:00
parent 8f7c8ca7f0
commit 16037e2169
No known key found for this signature in database
2 changed files with 17 additions and 10 deletions

View File

@ -1,4 +1,4 @@
use std::collections::HashMap; use std::collections::{BTreeSet, HashMap};
use std::convert::TryInto; use std::convert::TryInto;
use std::fs::File; use std::fs::File;
use std::io::{self, BufReader}; use std::io::{self, BufReader};
@ -211,6 +211,8 @@ pub(crate) fn write_typed_chunk_into_index(
let mut docids = index.documents_ids(wtxn)?; let mut docids = index.documents_ids(wtxn)?;
let mut iter = merger.into_stream_merger_iter()?; let mut iter = merger.into_stream_merger_iter()?;
let embedders: BTreeSet<_> =
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
let mut vectors_buffer = Vec::new(); let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? { while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -225,9 +227,8 @@ pub(crate) fn write_typed_chunk_into_index(
let del_add_reader = KvReaderDelAdd::new(value); let del_add_reader = KvReaderDelAdd::new(value);
if let Some(addition) = del_add_reader.get(DelAdd::Addition) { if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
let addition = match vectors_fid { let addition = if vectors_fid == Some(field_id) {
// for the "_vectors" field, only keep vectors that are marked as userProvided 'vectors: {
Some(vectors_fid) if vectors_fid == field_id => 'vectors: {
vectors_buffer.clear(); vectors_buffer.clear();
let Ok(mut vectors) = let Ok(mut vectors) =
crate::vector::parsed_vectors::ParsedVectors::from_bytes( crate::vector::parsed_vectors::ParsedVectors::from_bytes(
@ -237,7 +238,7 @@ pub(crate) fn write_typed_chunk_into_index(
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
break 'vectors Some(addition); break 'vectors Some(addition);
}; };
vectors.retain_user_provided_vectors(); vectors.retain_user_provided_vectors(&embedders);
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
if vectors.is_empty() { if vectors.is_empty() {
// skip writing empty `_vectors` map // skip writing empty `_vectors` map
@ -248,8 +249,10 @@ pub(crate) fn write_typed_chunk_into_index(
.map_err(InternalError::SerdeJson)?; .map_err(InternalError::SerdeJson)?;
Some(vectors_buffer.as_slice()) Some(vectors_buffer.as_slice())
} }
_ => Some(addition), } else {
Some(addition)
}; };
if let Some(addition) = addition { if let Some(addition) = addition {
writer.insert(field_id, addition)?; writer.insert(field_id, addition)?;
} }

View File

@ -1,4 +1,4 @@
use std::collections::BTreeMap; use std::collections::{BTreeMap, BTreeSet};
use obkv::KvReader; use obkv::KvReader;
use serde_json::{from_slice, Value}; use serde_json::{from_slice, Value};
@ -89,10 +89,14 @@ impl ParsedVectors {
Ok(ParsedVectors(value)) Ok(ParsedVectors(value))
} }
/// Filters the parsed vectors map in place (via `self.0.retain`), deciding per entry whether to keep it.
///
/// NOTE(review): this span appears to be a flattened side-by-side diff — on most lines the
/// removed version comes first and the added version second; confirm against the real file.
///
/// Removed version (no `embedders` argument):
/// - `Vectors::ImplicitlyUserProvided(_)` entries are always kept;
/// - `Vectors::Explicit` entries are kept only when `user_provided` is true.
///
/// Added version (takes `embedders: &BTreeSet<String>`):
/// - `Vectors::ImplicitlyUserProvided(_)` entries are always kept;
/// - `Vectors::Explicit` entries are kept when `user_provided` is true, or when the entry's
///   key `k` is not contained in `embedders`.
///
/// The call site visible earlier in this diff builds `embedders` from the keys of
/// `index.embedding_configs(wtxn)`, so presumably the keys are embedder names — verify.
pub fn retain_user_provided_vectors(&mut self) { pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|_k, v| match v { self.0.retain(|k, v| match v {
Vectors::ImplicitlyUserProvided(_) => true, Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => *user_provided, Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
*user_provided
// if the embedder is not in the config, then never touch it
|| !embedders.contains(k)
}
}); });
} }
} }