mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Vectors
This commit is contained in:
parent
7058959a46
commit
1075dd34bb
@ -1300,6 +1300,8 @@ impl IndexScheduler {
|
|||||||
|
|
||||||
let mut content_files_iter = content_files.iter();
|
let mut content_files_iter = content_files.iter();
|
||||||
let mut indexer = indexer::DocumentOperation::new(method);
|
let mut indexer = indexer::DocumentOperation::new(method);
|
||||||
|
let embedders = index.embedding_configs(index_wtxn)?;
|
||||||
|
let embedders = self.embedders(embedders)?;
|
||||||
for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) {
|
for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) {
|
||||||
match operation {
|
match operation {
|
||||||
DocumentOperation::Add(_content_uuid) => {
|
DocumentOperation::Add(_content_uuid) => {
|
||||||
@ -1374,6 +1376,7 @@ impl IndexScheduler {
|
|||||||
primary_key_has_been_set.then_some(primary_key),
|
primary_key_has_been_set.then_some(primary_key),
|
||||||
&pool,
|
&pool,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
|
embedders,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
@ -1460,6 +1463,8 @@ impl IndexScheduler {
|
|||||||
|
|
||||||
let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone());
|
let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone());
|
||||||
let document_changes = indexer.into_changes(&primary_key)?;
|
let document_changes = indexer.into_changes(&primary_key)?;
|
||||||
|
let embedders = index.embedding_configs(index_wtxn)?;
|
||||||
|
let embedders = self.embedders(embedders)?;
|
||||||
|
|
||||||
indexer::index(
|
indexer::index(
|
||||||
index_wtxn,
|
index_wtxn,
|
||||||
@ -1469,6 +1474,7 @@ impl IndexScheduler {
|
|||||||
None, // cannot change primary key in DocumentEdition
|
None, // cannot change primary key in DocumentEdition
|
||||||
&pool,
|
&pool,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
|
embedders,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
@ -1596,6 +1602,8 @@ impl IndexScheduler {
|
|||||||
let mut indexer = indexer::DocumentDeletion::new();
|
let mut indexer = indexer::DocumentDeletion::new();
|
||||||
indexer.delete_documents_by_docids(to_delete);
|
indexer.delete_documents_by_docids(to_delete);
|
||||||
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
|
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
|
||||||
|
let embedders = index.embedding_configs(index_wtxn)?;
|
||||||
|
let embedders = self.embedders(embedders)?;
|
||||||
|
|
||||||
indexer::index(
|
indexer::index(
|
||||||
index_wtxn,
|
index_wtxn,
|
||||||
@ -1605,6 +1613,7 @@ impl IndexScheduler {
|
|||||||
None, // document deletion never changes primary key
|
None, // document deletion never changes primary key
|
||||||
&pool,
|
&pool,
|
||||||
&document_changes,
|
&document_changes,
|
||||||
|
embedders,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||||
|
@ -3,6 +3,7 @@ use std::marker::PhantomData;
|
|||||||
|
|
||||||
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
|
use hashbrown::HashMap;
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -124,7 +125,32 @@ impl DocumentDeletionEntry {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct WriterOperation {
|
pub enum WriterOperation {
|
||||||
|
DbOperation(DbOperation),
|
||||||
|
ArroyOperation(ArroyOperation),
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum ArroyOperation {
|
||||||
|
/// TODO: call when deleting regular documents
|
||||||
|
DeleteVectors {
|
||||||
|
docid: DocumentId,
|
||||||
|
},
|
||||||
|
SetVectors {
|
||||||
|
docid: DocumentId,
|
||||||
|
embedder_id: u8,
|
||||||
|
embeddings: Vec<Embedding>,
|
||||||
|
},
|
||||||
|
SetVector {
|
||||||
|
docid: DocumentId,
|
||||||
|
embedder_id: u8,
|
||||||
|
embedding: Embedding,
|
||||||
|
},
|
||||||
|
Finish {
|
||||||
|
user_provided: HashMap<String, RoaringBitmap>,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DbOperation {
|
||||||
database: Database,
|
database: Database,
|
||||||
entry: EntryOperation,
|
entry: EntryOperation,
|
||||||
}
|
}
|
||||||
@ -180,7 +206,7 @@ impl From<FacetKind> for Database {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl WriterOperation {
|
impl DbOperation {
|
||||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||||
self.database.database(index)
|
self.database.database(index)
|
||||||
}
|
}
|
||||||
@ -246,13 +272,13 @@ impl MergerSender {
|
|||||||
DOCUMENTS_IDS_KEY.as_bytes(),
|
DOCUMENTS_IDS_KEY.as_bytes(),
|
||||||
documents_ids,
|
documents_ids,
|
||||||
));
|
));
|
||||||
match self.send(WriterOperation { database: Database::Main, entry }) {
|
match self.send_db_operation(DbOperation { database: Database::Main, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> {
|
fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> {
|
||||||
if self.sender.is_full() {
|
if self.sender.is_full() {
|
||||||
self.writer_contentious_count.set(self.writer_contentious_count.get() + 1);
|
self.writer_contentious_count.set(self.writer_contentious_count.get() + 1);
|
||||||
}
|
}
|
||||||
@ -260,7 +286,7 @@ impl MergerSender {
|
|||||||
self.merger_contentious_count.set(self.merger_contentious_count.get() + 1);
|
self.merger_contentious_count.set(self.merger_contentious_count.get() + 1);
|
||||||
}
|
}
|
||||||
self.send_count.set(self.send_count.get() + 1);
|
self.send_count.set(self.send_count.get() + 1);
|
||||||
match self.sender.send(op) {
|
match self.sender.send(WriterOperation::DbOperation(op)) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -275,7 +301,7 @@ impl MainSender<'_> {
|
|||||||
WORDS_FST_KEY.as_bytes(),
|
WORDS_FST_KEY.as_bytes(),
|
||||||
value,
|
value,
|
||||||
));
|
));
|
||||||
match self.0.send(WriterOperation { database: Database::Main, entry }) {
|
match self.0.send_db_operation(DbOperation { database: Database::Main, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -286,7 +312,7 @@ impl MainSender<'_> {
|
|||||||
WORDS_PREFIXES_FST_KEY.as_bytes(),
|
WORDS_PREFIXES_FST_KEY.as_bytes(),
|
||||||
value,
|
value,
|
||||||
));
|
));
|
||||||
match self.0.send(WriterOperation { database: Database::Main, entry }) {
|
match self.0.send_db_operation(DbOperation { database: Database::Main, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -294,7 +320,7 @@ impl MainSender<'_> {
|
|||||||
|
|
||||||
pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||||
match self.0.send(WriterOperation { database: Database::Main, entry }) {
|
match self.0.send_db_operation(DbOperation { database: Database::Main, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -396,7 +422,7 @@ pub struct WordDocidsSender<'a, D> {
|
|||||||
impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
|
impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
|
||||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
||||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
||||||
match self.sender.send(WriterOperation { database: D::DATABASE, entry }) {
|
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -404,7 +430,7 @@ impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
|
|||||||
|
|
||||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||||
match self.sender.send(WriterOperation { database: D::DATABASE, entry }) {
|
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -429,7 +455,7 @@ impl DocidsSender for FacetDocidsSender<'_> {
|
|||||||
}
|
}
|
||||||
_ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)),
|
_ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)),
|
||||||
};
|
};
|
||||||
match self.sender.send(WriterOperation { database, entry }) {
|
match self.sender.send_db_operation(DbOperation { database, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -439,7 +465,7 @@ impl DocidsSender for FacetDocidsSender<'_> {
|
|||||||
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
||||||
let database = Database::from(facet_kind);
|
let database = Database::from(facet_kind);
|
||||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||||
match self.sender.send(WriterOperation { database, entry }) {
|
match self.sender.send_db_operation(DbOperation { database, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -460,7 +486,7 @@ impl DocumentsSender<'_> {
|
|||||||
&docid.to_be_bytes(),
|
&docid.to_be_bytes(),
|
||||||
document.as_bytes(),
|
document.as_bytes(),
|
||||||
));
|
));
|
||||||
match self.0.send(WriterOperation { database: Database::Documents, entry }) {
|
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}?;
|
}?;
|
||||||
@ -469,7 +495,10 @@ impl DocumentsSender<'_> {
|
|||||||
external_id.as_bytes(),
|
external_id.as_bytes(),
|
||||||
&docid.to_be_bytes(),
|
&docid.to_be_bytes(),
|
||||||
));
|
));
|
||||||
match self.0.send(WriterOperation { database: Database::ExternalDocumentsIds, entry }) {
|
match self
|
||||||
|
.0
|
||||||
|
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
||||||
|
{
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
@ -477,33 +506,38 @@ impl DocumentsSender<'_> {
|
|||||||
|
|
||||||
pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> {
|
pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> {
|
||||||
let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes()));
|
let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes()));
|
||||||
match self.0.send(WriterOperation { database: Database::Documents, entry }) {
|
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}?;
|
}?;
|
||||||
|
|
||||||
let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes()));
|
let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes()));
|
||||||
match self.0.send(WriterOperation { database: Database::ExternalDocumentsIds, entry }) {
|
match self
|
||||||
|
.0
|
||||||
|
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
||||||
|
{
|
||||||
Ok(()) => Ok(()),
|
Ok(()) => Ok(()),
|
||||||
Err(SendError(_)) => Err(SendError(())),
|
Err(SendError(_)) => Err(SendError(())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct EmbeddingSender<'a>(Option<&'a Sender<MergerOperation>>);
|
pub struct EmbeddingSender<'a>(&'a Sender<WriterOperation>);
|
||||||
|
|
||||||
impl EmbeddingSender<'_> {
|
impl EmbeddingSender<'_> {
|
||||||
pub fn delete(&self, docid: DocumentId, embedder_id: u8) -> StdResult<(), SendError<()>> {
|
|
||||||
todo!()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn set_vectors(
|
pub fn set_vectors(
|
||||||
&self,
|
&self,
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embeddings: Vec<Embedding>,
|
embeddings: Vec<Embedding>,
|
||||||
) -> StdResult<(), SendError<()>> {
|
) -> StdResult<(), SendError<()>> {
|
||||||
todo!()
|
self.0
|
||||||
|
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors {
|
||||||
|
docid,
|
||||||
|
embedder_id,
|
||||||
|
embeddings,
|
||||||
|
}))
|
||||||
|
.map_err(|_| SendError(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_vector(
|
pub fn set_vector(
|
||||||
@ -512,19 +546,24 @@ impl EmbeddingSender<'_> {
|
|||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embedding: Embedding,
|
embedding: Embedding,
|
||||||
) -> StdResult<(), SendError<()>> {
|
) -> StdResult<(), SendError<()>> {
|
||||||
todo!()
|
self.0
|
||||||
|
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVector {
|
||||||
|
docid,
|
||||||
|
embedder_id,
|
||||||
|
embedding,
|
||||||
|
}))
|
||||||
|
.map_err(|_| SendError(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_user_provided(
|
/// Marks all embedders as "to be built"
|
||||||
&self,
|
pub fn finish(
|
||||||
docid: DocumentId,
|
self,
|
||||||
embedder_id: u8,
|
user_provided: HashMap<String, RoaringBitmap>,
|
||||||
regenerate: bool,
|
|
||||||
) -> StdResult<(), SendError<()>> {
|
) -> StdResult<(), SendError<()>> {
|
||||||
todo!()
|
self.0
|
||||||
|
.send(WriterOperation::ArroyOperation(ArroyOperation::Finish { user_provided }))
|
||||||
|
.map_err(|_| SendError(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn finish(self, embedder_id: u8) {}
|
|
||||||
}
|
}
|
||||||
pub enum MergerOperation {
|
pub enum MergerOperation {
|
||||||
ExactWordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
ExactWordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||||
|
@ -4,7 +4,7 @@ use heed::RoTxn;
|
|||||||
use raw_collections::RawMap;
|
use raw_collections::RawMap;
|
||||||
use serde_json::value::RawValue;
|
use serde_json::value::RawValue;
|
||||||
|
|
||||||
use super::vector_document::{VectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions};
|
use super::vector_document::VectorDocument;
|
||||||
use super::{KvReaderFieldId, KvWriterFieldId};
|
use super::{KvReaderFieldId, KvWriterFieldId};
|
||||||
use crate::documents::FieldIdMapper;
|
use crate::documents::FieldIdMapper;
|
||||||
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
||||||
|
@ -267,7 +267,7 @@ impl Stats {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone, Default)]
|
||||||
pub struct DelAddRoaringBitmap {
|
pub struct DelAddRoaringBitmap {
|
||||||
pub(crate) del: Option<RoaringBitmap>,
|
pub(crate) del: Option<RoaringBitmap>,
|
||||||
pub(crate) add: Option<RoaringBitmap>,
|
pub(crate) add: Option<RoaringBitmap>,
|
||||||
|
@ -11,6 +11,7 @@ use bumpalo::Bump;
|
|||||||
pub use faceted::*;
|
pub use faceted::*;
|
||||||
use grenad::Merger;
|
use grenad::Merger;
|
||||||
pub use searchable::*;
|
pub use searchable::*;
|
||||||
|
pub use vectors::EmbeddingExtractor;
|
||||||
|
|
||||||
use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal};
|
use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal};
|
||||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||||
|
@ -1,3 +1,10 @@
|
|||||||
|
use std::cell::RefCell;
|
||||||
|
|
||||||
|
use bumpalo::collections::Vec as BVec;
|
||||||
|
use bumpalo::Bump;
|
||||||
|
use hashbrown::HashMap;
|
||||||
|
|
||||||
|
use super::cache::DelAddRoaringBitmap;
|
||||||
use crate::error::FaultSource;
|
use crate::error::FaultSource;
|
||||||
use crate::prompt::Prompt;
|
use crate::prompt::Prompt;
|
||||||
use crate::update::new::channel::EmbeddingSender;
|
use crate::update::new::channel::EmbeddingSender;
|
||||||
@ -5,26 +12,34 @@ use crate::update::new::indexer::document_changes::{Extractor, FullySend};
|
|||||||
use crate::update::new::vector_document::VectorDocument;
|
use crate::update::new::vector_document::VectorDocument;
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::vector::error::EmbedErrorKind;
|
use crate::vector::error::EmbedErrorKind;
|
||||||
use crate::vector::Embedder;
|
use crate::vector::{Embedder, Embedding, EmbeddingConfigs};
|
||||||
use crate::{DocumentId, Result, ThreadPoolNoAbort, UserError};
|
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, UserError};
|
||||||
|
|
||||||
pub struct EmbeddingExtractor<'a> {
|
pub struct EmbeddingExtractor<'a> {
|
||||||
embedder: &'a Embedder,
|
embedders: &'a EmbeddingConfigs,
|
||||||
prompt: &'a Prompt,
|
|
||||||
embedder_id: u8,
|
|
||||||
embedder_name: &'a str,
|
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> EmbeddingExtractor<'a> {
|
||||||
|
pub fn new(
|
||||||
|
embedders: &'a EmbeddingConfigs,
|
||||||
|
sender: &'a EmbeddingSender<'a>,
|
||||||
|
threads: &'a ThreadPoolNoAbort,
|
||||||
|
) -> Self {
|
||||||
|
Self { embedders, sender, threads }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
||||||
type Data = FullySend<()>;
|
type Data = FullySend<RefCell<HashMap<String, DelAddRoaringBitmap>>>;
|
||||||
|
|
||||||
fn init_data<'doc>(
|
fn init_data<'doc>(
|
||||||
&'doc self,
|
&'doc self,
|
||||||
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||||
) -> crate::Result<Self::Data> {
|
) -> crate::Result<Self::Data> {
|
||||||
Ok(FullySend(()))
|
/// TODO: use the extractor_alloc in the hashbrown once you merge the branch where it is no longer a RefBump
|
||||||
|
Ok(FullySend(Default::default()))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process<'doc>(
|
fn process<'doc>(
|
||||||
@ -34,63 +49,90 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
Self::Data,
|
Self::Data,
|
||||||
>,
|
>,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let embedder_name: &str = self.embedder_name;
|
let embedders = self.embedders.inner_as_ref();
|
||||||
let embedder: &Embedder = self.embedder;
|
|
||||||
let prompt: &Prompt = self.prompt;
|
|
||||||
|
|
||||||
let mut chunks = Chunks::new(
|
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
|
||||||
embedder,
|
for (embedder_name, (embedder, prompt, _is_quantized)) in embedders {
|
||||||
self.embedder_id,
|
let embedder_id =
|
||||||
embedder_name,
|
context.index.embedder_category_id.get(&context.txn, embedder_name)?.ok_or_else(
|
||||||
self.threads,
|
|| InternalError::DatabaseMissingEntry {
|
||||||
self.sender,
|
db_name: "embedder_category_id",
|
||||||
&context.doc_alloc,
|
key: None,
|
||||||
);
|
},
|
||||||
|
)?;
|
||||||
|
all_chunks.push(Chunks::new(
|
||||||
|
embedder,
|
||||||
|
embedder_id,
|
||||||
|
embedder_name,
|
||||||
|
prompt,
|
||||||
|
&context.data.0,
|
||||||
|
self.threads,
|
||||||
|
self.sender,
|
||||||
|
&context.doc_alloc,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
for change in changes {
|
for change in changes {
|
||||||
let change = change?;
|
let change = change?;
|
||||||
match change {
|
match change {
|
||||||
DocumentChange::Deletion(deletion) => {
|
DocumentChange::Deletion(_deletion) => {
|
||||||
self.sender.delete(deletion.docid(), self.embedder_id).unwrap();
|
// handled by document sender
|
||||||
}
|
}
|
||||||
DocumentChange::Update(update) => {
|
DocumentChange::Update(update) => {
|
||||||
/// FIXME: this will force the parsing/retrieval of VectorDocument once per embedder
|
|
||||||
/// consider doing all embedders at once?
|
|
||||||
let old_vectors = update.current_vectors(
|
let old_vectors = update.current_vectors(
|
||||||
&context.txn,
|
&context.txn,
|
||||||
context.index,
|
context.index,
|
||||||
context.db_fields_ids_map,
|
context.db_fields_ids_map,
|
||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap();
|
|
||||||
let new_vectors = update.updated_vectors(&context.doc_alloc)?;
|
let new_vectors = update.updated_vectors(&context.doc_alloc)?;
|
||||||
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
|
||||||
new_vectors.vectors_for_key(embedder_name).transpose()
|
for chunks in &mut all_chunks {
|
||||||
}) {
|
let embedder_name = chunks.embedder_name();
|
||||||
let new_vectors = new_vectors?;
|
let prompt = chunks.prompt();
|
||||||
match (old_vectors.regenerate, new_vectors.regenerate) {
|
|
||||||
(true, true) | (false, false) => todo!(),
|
let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap();
|
||||||
_ => {
|
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
||||||
self.sender
|
new_vectors.vectors_for_key(embedder_name).transpose()
|
||||||
.set_user_provided(
|
}) {
|
||||||
update.docid(),
|
let new_vectors = new_vectors?;
|
||||||
self.embedder_id,
|
match (old_vectors.regenerate, new_vectors.regenerate) {
|
||||||
!new_vectors.regenerate,
|
(true, true) | (false, false) => todo!(),
|
||||||
)
|
_ => {
|
||||||
.unwrap();
|
chunks.set_regenerate(update.docid(), new_vectors.regenerate);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
// do we have set embeddings?
|
||||||
// do we have set embeddings?
|
if let Some(embeddings) = new_vectors.embeddings {
|
||||||
if let Some(embeddings) = new_vectors.embeddings {
|
chunks.set_vectors(
|
||||||
self.sender
|
|
||||||
.set_vectors(
|
|
||||||
update.docid(),
|
update.docid(),
|
||||||
self.embedder_id,
|
|
||||||
embeddings.into_vec().map_err(UserError::SerdeJson)?,
|
embeddings.into_vec().map_err(UserError::SerdeJson)?,
|
||||||
)
|
);
|
||||||
.unwrap();
|
} else if new_vectors.regenerate {
|
||||||
} else if new_vectors.regenerate {
|
let new_rendered = prompt.render_document(
|
||||||
let new_rendered = prompt.render_document(
|
update.current(
|
||||||
|
&context.txn,
|
||||||
|
context.index,
|
||||||
|
context.db_fields_ids_map,
|
||||||
|
)?,
|
||||||
|
context.new_fields_ids_map,
|
||||||
|
&context.doc_alloc,
|
||||||
|
)?;
|
||||||
|
let old_rendered = prompt.render_document(
|
||||||
|
update.merged(
|
||||||
|
&context.txn,
|
||||||
|
context.index,
|
||||||
|
context.db_fields_ids_map,
|
||||||
|
)?,
|
||||||
|
context.new_fields_ids_map,
|
||||||
|
&context.doc_alloc,
|
||||||
|
)?;
|
||||||
|
if new_rendered != old_rendered {
|
||||||
|
chunks.set_autogenerated(update.docid(), new_rendered)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if old_vectors.regenerate {
|
||||||
|
let old_rendered = prompt.render_document(
|
||||||
update.current(
|
update.current(
|
||||||
&context.txn,
|
&context.txn,
|
||||||
context.index,
|
context.index,
|
||||||
@ -99,7 +141,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
context.new_fields_ids_map,
|
context.new_fields_ids_map,
|
||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
let old_rendered = prompt.render_document(
|
let new_rendered = prompt.render_document(
|
||||||
update.merged(
|
update.merged(
|
||||||
&context.txn,
|
&context.txn,
|
||||||
context.index,
|
context.index,
|
||||||
@ -109,82 +151,55 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
if new_rendered != old_rendered {
|
if new_rendered != old_rendered {
|
||||||
chunks.push(update.docid(), new_rendered)?;
|
chunks.set_autogenerated(update.docid(), new_rendered)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if old_vectors.regenerate {
|
|
||||||
let old_rendered = prompt.render_document(
|
|
||||||
update.current(
|
|
||||||
&context.txn,
|
|
||||||
context.index,
|
|
||||||
context.db_fields_ids_map,
|
|
||||||
)?,
|
|
||||||
context.new_fields_ids_map,
|
|
||||||
&context.doc_alloc,
|
|
||||||
)?;
|
|
||||||
let new_rendered = prompt.render_document(
|
|
||||||
update.merged(
|
|
||||||
&context.txn,
|
|
||||||
context.index,
|
|
||||||
context.db_fields_ids_map,
|
|
||||||
)?,
|
|
||||||
context.new_fields_ids_map,
|
|
||||||
&context.doc_alloc,
|
|
||||||
)?;
|
|
||||||
if new_rendered != old_rendered {
|
|
||||||
chunks.push(update.docid(), new_rendered)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
DocumentChange::Insertion(insertion) => {
|
DocumentChange::Insertion(insertion) => {
|
||||||
// if no inserted vectors, then regenerate: true + no embeddings => autogenerate
|
for chunks in &mut all_chunks {
|
||||||
let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?;
|
let embedder_name = chunks.embedder_name();
|
||||||
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
let prompt = chunks.prompt();
|
||||||
new_vectors.vectors_for_key(embedder_name).transpose()
|
// if no inserted vectors, then regenerate: true + no embeddings => autogenerate
|
||||||
}) {
|
let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?;
|
||||||
let new_vectors = new_vectors?;
|
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
||||||
self.sender
|
new_vectors.vectors_for_key(embedder_name).transpose()
|
||||||
.set_user_provided(
|
}) {
|
||||||
insertion.docid(),
|
let new_vectors = new_vectors?;
|
||||||
self.embedder_id,
|
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
||||||
!new_vectors.regenerate,
|
if let Some(embeddings) = new_vectors.embeddings {
|
||||||
)
|
chunks.set_vectors(
|
||||||
.unwrap();
|
|
||||||
if let Some(embeddings) = new_vectors.embeddings {
|
|
||||||
self.sender
|
|
||||||
.set_vectors(
|
|
||||||
insertion.docid(),
|
insertion.docid(),
|
||||||
self.embedder_id,
|
|
||||||
embeddings.into_vec().map_err(UserError::SerdeJson)?,
|
embeddings.into_vec().map_err(UserError::SerdeJson)?,
|
||||||
)
|
);
|
||||||
.unwrap();
|
} else if new_vectors.regenerate {
|
||||||
} else if new_vectors.regenerate {
|
let rendered = prompt.render_document(
|
||||||
|
insertion.inserted(),
|
||||||
|
context.new_fields_ids_map,
|
||||||
|
&context.doc_alloc,
|
||||||
|
)?;
|
||||||
|
chunks.set_autogenerated(insertion.docid(), rendered)?;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
let rendered = prompt.render_document(
|
let rendered = prompt.render_document(
|
||||||
insertion.inserted(),
|
insertion.inserted(),
|
||||||
context.new_fields_ids_map,
|
context.new_fields_ids_map,
|
||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
chunks.push(insertion.docid(), rendered)?;
|
chunks.set_autogenerated(insertion.docid(), rendered)?;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
let rendered = prompt.render_document(
|
|
||||||
insertion.inserted(),
|
|
||||||
context.new_fields_ids_map,
|
|
||||||
&context.doc_alloc,
|
|
||||||
)?;
|
|
||||||
chunks.push(insertion.docid(), rendered)?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
chunks.drain()
|
for chunk in all_chunks {
|
||||||
|
chunk.drain()?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
use bumpalo::collections::Vec as BVec;
|
|
||||||
use bumpalo::Bump;
|
|
||||||
|
|
||||||
// **Warning**: the destructor of this struct is not normally run, make sure that all its fields:
|
// **Warning**: the destructor of this struct is not normally run, make sure that all its fields:
|
||||||
// 1. don't have side effects tied to they destructors
|
// 1. don't have side effects tied to they destructors
|
||||||
// 2. if allocated, are allocated inside of the bumpalo
|
// 2. if allocated, are allocated inside of the bumpalo
|
||||||
@ -199,15 +214,21 @@ struct Chunks<'a> {
|
|||||||
embedder: &'a Embedder,
|
embedder: &'a Embedder,
|
||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embedder_name: &'a str,
|
embedder_name: &'a str,
|
||||||
|
prompt: &'a Prompt,
|
||||||
|
|
||||||
|
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Chunks<'a> {
|
impl<'a> Chunks<'a> {
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub fn new(
|
pub fn new(
|
||||||
embedder: &'a Embedder,
|
embedder: &'a Embedder,
|
||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embedder_name: &'a str,
|
embedder_name: &'a str,
|
||||||
|
prompt: &'a Prompt,
|
||||||
|
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
doc_alloc: &'a Bump,
|
doc_alloc: &'a Bump,
|
||||||
@ -215,10 +236,20 @@ impl<'a> Chunks<'a> {
|
|||||||
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
||||||
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
||||||
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
||||||
Self { texts, ids, embedder, threads, sender, embedder_id, embedder_name }
|
Self {
|
||||||
|
texts,
|
||||||
|
ids,
|
||||||
|
embedder,
|
||||||
|
prompt,
|
||||||
|
threads,
|
||||||
|
sender,
|
||||||
|
embedder_id,
|
||||||
|
embedder_name,
|
||||||
|
user_provided,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn push(&mut self, docid: DocumentId, rendered: &'a str) -> Result<()> {
|
pub fn set_autogenerated(&mut self, docid: DocumentId, rendered: &'a str) -> Result<()> {
|
||||||
if self.texts.len() < self.texts.capacity() {
|
if self.texts.len() < self.texts.capacity() {
|
||||||
self.texts.push(rendered);
|
self.texts.push(rendered);
|
||||||
self.ids.push(docid);
|
self.ids.push(docid);
|
||||||
@ -316,4 +347,28 @@ impl<'a> Chunks<'a> {
|
|||||||
ids.clear();
|
ids.clear();
|
||||||
res
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn prompt(&self) -> &'a Prompt {
|
||||||
|
self.prompt
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn embedder_name(&self) -> &'a str {
|
||||||
|
self.embedder_name
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_regenerate(&self, docid: DocumentId, regenerate: bool) {
|
||||||
|
let mut user_provided = self.user_provided.borrow_mut();
|
||||||
|
let user_provided =
|
||||||
|
user_provided.entry_ref(self.embedder_name).or_insert(Default::default());
|
||||||
|
if regenerate {
|
||||||
|
// regenerate == !user_provided
|
||||||
|
user_provided.del.get_or_insert(Default::default()).insert(docid);
|
||||||
|
} else {
|
||||||
|
user_provided.add.get_or_insert(Default::default()).insert(docid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
|
||||||
|
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::sync::RwLock;
|
use std::sync::{OnceLock, RwLock};
|
||||||
use std::thread::{self, Builder};
|
use std::thread::{self, Builder};
|
||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
@ -10,9 +10,13 @@ use document_changes::{
|
|||||||
};
|
};
|
||||||
pub use document_deletion::DocumentDeletion;
|
pub use document_deletion::DocumentDeletion;
|
||||||
pub use document_operation::DocumentOperation;
|
pub use document_operation::DocumentOperation;
|
||||||
|
use hashbrown::HashMap;
|
||||||
use heed::{RoTxn, RwTxn};
|
use heed::{RoTxn, RwTxn};
|
||||||
|
use itertools::{EitherOrBoth, Itertools};
|
||||||
pub use partial_dump::PartialDump;
|
pub use partial_dump::PartialDump;
|
||||||
|
use rand::SeedableRng as _;
|
||||||
use rayon::ThreadPool;
|
use rayon::ThreadPool;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
pub use update_by_function::UpdateByFunction;
|
pub use update_by_function::UpdateByFunction;
|
||||||
|
|
||||||
@ -31,10 +35,15 @@ use crate::facet::FacetType;
|
|||||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::update::new::channel::ExtractorSender;
|
use crate::update::new::channel::ExtractorSender;
|
||||||
|
use crate::update::new::extract::EmbeddingExtractor;
|
||||||
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
|
||||||
use crate::update::settings::InnerIndexSettings;
|
use crate::update::settings::InnerIndexSettings;
|
||||||
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
use crate::update::{FacetsUpdateBulk, GrenadParameters};
|
||||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
use crate::vector::{ArroyWrapper, EmbeddingConfigs};
|
||||||
|
use crate::{
|
||||||
|
FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort,
|
||||||
|
ThreadPoolNoAbortBuilder, UserError,
|
||||||
|
};
|
||||||
|
|
||||||
pub(crate) mod de;
|
pub(crate) mod de;
|
||||||
pub mod document_changes;
|
pub mod document_changes;
|
||||||
@ -119,6 +128,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
|
|||||||
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
||||||
///
|
///
|
||||||
/// TODO return stats
|
/// TODO return stats
|
||||||
|
#[allow(clippy::too_many_arguments)] // clippy: 😝
|
||||||
pub fn index<'pl, 'indexer, 'index, DC>(
|
pub fn index<'pl, 'indexer, 'index, DC>(
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
index: &'index Index,
|
index: &'index Index,
|
||||||
@ -127,6 +137,7 @@ pub fn index<'pl, 'indexer, 'index, DC>(
|
|||||||
new_primary_key: Option<PrimaryKey<'pl>>,
|
new_primary_key: Option<PrimaryKey<'pl>>,
|
||||||
pool: &ThreadPool,
|
pool: &ThreadPool,
|
||||||
document_changes: &DC,
|
document_changes: &DC,
|
||||||
|
embedders: EmbeddingConfigs,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
where
|
where
|
||||||
DC: DocumentChanges<'pl>,
|
DC: DocumentChanges<'pl>,
|
||||||
@ -153,8 +164,9 @@ where
|
|||||||
fields_ids_map_store: &fields_ids_map_store,
|
fields_ids_map_store: &fields_ids_map_store,
|
||||||
};
|
};
|
||||||
|
|
||||||
thread::scope(|s| {
|
thread::scope(|s| -> Result<()> {
|
||||||
let indexer_span = tracing::Span::current();
|
let indexer_span = tracing::Span::current();
|
||||||
|
let embedders = &embedders;
|
||||||
// TODO manage the errors correctly
|
// TODO manage the errors correctly
|
||||||
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
|
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
|
||||||
pool.in_place_scope(|_s| {
|
pool.in_place_scope(|_s| {
|
||||||
@ -238,9 +250,29 @@ where
|
|||||||
if index_embeddings.is_empty() {
|
if index_embeddings.is_empty() {
|
||||||
break 'vectors;
|
break 'vectors;
|
||||||
}
|
}
|
||||||
for index_embedding in index_embeddings {
|
/// FIXME: need access to `merger_sender`
|
||||||
|
let embedding_sender = todo!();
|
||||||
|
let extractor = EmbeddingExtractor::new(&embedders, &embedding_sender, request_threads());
|
||||||
|
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
|
|
||||||
|
for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?;
|
||||||
|
|
||||||
|
|
||||||
|
let mut user_provided = HashMap::new();
|
||||||
|
for data in datastore {
|
||||||
|
let data = data.0.into_inner();
|
||||||
|
for (embedder, deladd) in data.into_iter() {
|
||||||
|
let user_provided = user_provided.entry(embedder).or_insert(Default::default());
|
||||||
|
if let Some(del) = deladd.del {
|
||||||
|
*user_provided -= del;
|
||||||
|
}
|
||||||
|
if let Some(add) = deladd.add {
|
||||||
|
*user_provided |= add;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
embedding_sender.finish(user_provided).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -285,15 +317,137 @@ where
|
|||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
|
let vector_arroy = index.vector_arroy;
|
||||||
|
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||||
|
let indexer_span = tracing::Span::current();
|
||||||
|
let arroy_writers: Result<HashMap<_, _>> = embedders
|
||||||
|
.inner_as_ref()
|
||||||
|
.iter()
|
||||||
|
.map(|(embedder_name, (embedder, _, was_quantized))| {
|
||||||
|
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||||
|
InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: "embedder_category_id",
|
||||||
|
key: None,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let dimensions = embedder.dimensions();
|
||||||
|
|
||||||
|
let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
|
||||||
|
.map(|k| ArroyWrapper::new(vector_arroy, k, *was_quantized))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Ok((
|
||||||
|
embedder_index,
|
||||||
|
(embedder_name.as_str(), embedder.as_ref(), writers, dimensions),
|
||||||
|
))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut arroy_writers = arroy_writers?;
|
||||||
for operation in writer_receiver {
|
for operation in writer_receiver {
|
||||||
let database = operation.database(index);
|
match operation {
|
||||||
match operation.entry() {
|
WriterOperation::DbOperation(db_operation) => {
|
||||||
EntryOperation::Delete(e) => {
|
let database = db_operation.database(index);
|
||||||
if !database.delete(wtxn, e.entry())? {
|
match db_operation.entry() {
|
||||||
unreachable!("We tried to delete an unknown key")
|
EntryOperation::Delete(e) => {
|
||||||
|
if !database.delete(wtxn, e.entry())? {
|
||||||
|
unreachable!("We tried to delete an unknown key")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?,
|
WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation {
|
||||||
|
ArroyOperation::DeleteVectors { docid } => {
|
||||||
|
for (_embedder_index, (_embedder_name, _embedder, writers, dimensions)) in
|
||||||
|
&mut arroy_writers
|
||||||
|
{
|
||||||
|
let dimensions = *dimensions;
|
||||||
|
for writer in writers {
|
||||||
|
// Uses invariant: vectors are packed in the first writers.
|
||||||
|
if !writer.del_item(wtxn, dimensions, docid)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ArroyOperation::SetVectors { docid, embedder_id, embeddings } => {
|
||||||
|
let (_, _, writers, dimensions) =
|
||||||
|
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||||
|
for res in writers.iter().zip_longest(&embeddings) {
|
||||||
|
match res {
|
||||||
|
EitherOrBoth::Both(writer, embedding) => {
|
||||||
|
writer.add_item(wtxn, *dimensions, docid, embedding)?;
|
||||||
|
}
|
||||||
|
EitherOrBoth::Left(writer) => {
|
||||||
|
let deleted = writer.del_item(wtxn, *dimensions, docid)?;
|
||||||
|
if !deleted {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right(_embedding) => {
|
||||||
|
let external_document_id = index
|
||||||
|
.external_id_of(wtxn, std::iter::once(docid))?
|
||||||
|
.into_iter()
|
||||||
|
.next()
|
||||||
|
.unwrap()?;
|
||||||
|
return Err(UserError::TooManyVectors(
|
||||||
|
external_document_id,
|
||||||
|
embeddings.len(),
|
||||||
|
)
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ArroyOperation::SetVector { docid, embedder_id, embedding } => {
|
||||||
|
let (_, _, writers, dimensions) =
|
||||||
|
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||||
|
for res in writers.iter().zip_longest(std::iter::once(&embedding)) {
|
||||||
|
match res {
|
||||||
|
EitherOrBoth::Both(writer, embedding) => {
|
||||||
|
writer.add_item(wtxn, *dimensions, docid, embedding)?;
|
||||||
|
}
|
||||||
|
EitherOrBoth::Left(writer) => {
|
||||||
|
let deleted = writer.del_item(wtxn, *dimensions, docid)?;
|
||||||
|
if !deleted {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right(_embedding) => {
|
||||||
|
unreachable!("1 vs 256 vectors")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ArroyOperation::Finish { mut user_provided } => {
|
||||||
|
let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build");
|
||||||
|
let _entered = span.enter();
|
||||||
|
for (_embedder_index, (_embedder_name, _embedder, writers, dimensions)) in
|
||||||
|
&mut arroy_writers
|
||||||
|
{
|
||||||
|
let dimensions = *dimensions;
|
||||||
|
for writer in writers {
|
||||||
|
if writer.need_build(wtxn, dimensions)? {
|
||||||
|
writer.build(wtxn, &mut rng, dimensions)?;
|
||||||
|
} else if writer.is_empty(wtxn, dimensions)? {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut configs = index.embedding_configs(wtxn)?;
|
||||||
|
|
||||||
|
for config in &mut configs {
|
||||||
|
if let Some(user_provided) = user_provided.remove(&config.name) {
|
||||||
|
config.user_provided = user_provided;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
index.put_embedding_configs(wtxn, configs)?;
|
||||||
|
}
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -483,3 +637,15 @@ pub fn retrieve_or_guess_primary_key<'a>(
|
|||||||
Err(err) => Ok(Err(err)),
|
Err(err) => Ok(Err(err)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn request_threads() -> &'static ThreadPoolNoAbort {
|
||||||
|
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
|
||||||
|
|
||||||
|
REQUEST_THREADS.get_or_init(|| {
|
||||||
|
ThreadPoolNoAbortBuilder::new()
|
||||||
|
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||||
|
.thread_name(|index| format!("embedding-request-{index}"))
|
||||||
|
.build()
|
||||||
|
.unwrap()
|
||||||
|
})
|
||||||
|
}
|
||||||
|
@ -149,6 +149,7 @@ pub fn merge_grenad_entries(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
MergerOperation::DeleteDocument { docid, external_id } => {
|
MergerOperation::DeleteDocument { docid, external_id } => {
|
||||||
|
/// TODO: delete vectors
|
||||||
let span =
|
let span =
|
||||||
tracing::trace_span!(target: "indexing::documents::merge", "delete_document");
|
tracing::trace_span!(target: "indexing::documents::merge", "delete_document");
|
||||||
let _entered = span.enter();
|
let _entered = span.enter();
|
||||||
|
Loading…
Reference in New Issue
Block a user