forward the embedding config to the extractors

This commit is contained in:
Tamo 2024-05-30 12:16:06 +02:00
parent 9eb6f522ea
commit a73ccc78a6
3 changed files with 15 additions and 1 deletion

View File

@ -14,6 +14,7 @@ use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::index::IndexEmbeddingConfig;
use crate::prompt::Prompt; use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
@ -96,6 +97,7 @@ struct EmbedderVectorExtractor {
pub fn extract_vector_points<R: io::Read + io::Seek>( pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
embedders_configs: &[IndexEmbeddingConfig],
settings_diff: &InnerIndexSettingsDiff, settings_diff: &InnerIndexSettingsDiff,
) -> Result<Vec<ExtractedVectorPoints>> { ) -> Result<Vec<ExtractedVectorPoints>> {
let reindex_vectors = settings_diff.reindex_vectors(); let reindex_vectors = settings_diff.reindex_vectors();

View File

@ -30,6 +30,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
use self::extract_word_position_docids::extract_word_position_docids; use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk}; use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::update::settings::InnerIndexSettingsDiff; use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
@ -43,6 +44,7 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId, primary_key_id: FieldId,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<()> { ) -> Result<()> {
@ -55,6 +57,7 @@ pub(crate) fn data_from_obkv_documents(
original_documents_chunk, original_documents_chunk,
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
embedders_configs.clone(),
settings_diff.clone(), settings_diff.clone(),
) )
}) })
@ -210,6 +213,7 @@ fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>, settings_diff: Arc<InnerIndexSettingsDiff>,
) -> Result<()> { ) -> Result<()> {
let original_documents_chunk = let original_documents_chunk =
@ -226,11 +230,17 @@ fn send_original_documents_data(
if index_vectors { if index_vectors {
let settings_diff = settings_diff.clone(); let settings_diff = settings_diff.clone();
let embedders_configs = embedders_configs.clone();
let original_documents_chunk = original_documents_chunk.clone(); let original_documents_chunk = original_documents_chunk.clone();
let lmdb_writer_sx = lmdb_writer_sx.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || { rayon::spawn(move || {
match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { match extract_vector_points(
original_documents_chunk.clone(),
indexer,
&embedders_configs,
&settings_diff,
) {
Ok(extracted_vectors) => { Ok(extracted_vectors) => {
for ExtractedVectorPoints { for ExtractedVectorPoints {
manual_vectors, manual_vectors,

View File

@ -286,6 +286,7 @@ where
settings_diff.new.recompute_searchables(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
let settings_diff = Arc::new(settings_diff); let settings_diff = Arc::new(settings_diff);
let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
let backup_pool; let backup_pool;
let pool = match self.indexer_config.thread_pool { let pool = match self.indexer_config.thread_pool {
@ -399,6 +400,7 @@ where
pool_params, pool_params,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
primary_key_id, primary_key_id,
embedders_configs.clone(),
settings_diff_cloned, settings_diff_cloned,
max_positions_per_attributes, max_positions_per_attributes,
) )