diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs
new file mode 100644
index 000000000..d174232bf
--- /dev/null
+++ b/crates/milli/src/vector/composite.rs
@@ -0,0 +1,299 @@
+use std::time::Instant;
+
+use arroy::Distance;
+
+use super::error::CompositeEmbedderContainsHuggingFace;
+use super::{
+    hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, NewEmbedderError,
+};
+use crate::ThreadPoolNoAbort;
+
+/// One of the concrete embedders that a composite [`Embedder`] delegates to.
+#[derive(Debug)]
+pub enum SubEmbedder {
+    /// An embedder based on running local models, fetched from the Hugging Face Hub.
+    HuggingFace(hf::Embedder),
+    /// An embedder based on making embedding queries against the OpenAI API.
+    OpenAi(openai::Embedder),
+    /// An embedder based on the user providing the embeddings in the documents and queries.
+    UserProvided(manual::Embedder),
+    /// An embedder based on making embedding queries against an embedding server.
+    Ollama(ollama::Embedder),
+    /// An embedder based on making embedding queries against a generic JSON/REST embedding server.
+    Rest(rest::Embedder),
+}
+
+/// Options from which [`SubEmbedder::new`] builds the matching [`SubEmbedder`] variant.
+#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
+pub enum SubEmbedderOptions {
+    HuggingFace(hf::EmbedderOptions),
+    OpenAi(openai::EmbedderOptions),
+    Ollama(ollama::EmbedderOptions),
+    UserProvided(manual::EmbedderOptions),
+    Rest(rest::EmbedderOptions),
+}
+
+impl SubEmbedderOptions {
+    /// Returns the `distribution` field of the wrapped embedder options.
+    pub fn distribution(&self) -> Option<DistributionShift> {
+        match self {
+            SubEmbedderOptions::HuggingFace(embedder_options) => embedder_options.distribution,
+            SubEmbedderOptions::OpenAi(embedder_options) => embedder_options.distribution,
+            SubEmbedderOptions::Ollama(embedder_options) => embedder_options.distribution,
+            SubEmbedderOptions::UserProvided(embedder_options) => embedder_options.distribution,
+            SubEmbedderOptions::Rest(embedder_options) => embedder_options.distribution,
+        }
+    }
+}
+
+/// A composite embedder made of a `search` sub-embedder and an `index` sub-embedder.
+#[derive(Debug)]
+pub struct Embedder {
+    pub(super) search: SubEmbedder,
+    pub(super) index: SubEmbedder,
+}
+
+/// Options for the `search` and `index` sub-embedders of a composite [`Embedder`].
+#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
+pub struct EmbedderOptions {
+    pub search: SubEmbedderOptions,
+    pub index: SubEmbedderOptions,
+}
+
+impl Embedder {
+    /// Builds both sub-embedders and checks that they are compatible.
+    ///
+    /// # Errors
+    ///
+    /// Fails if either sub-embedder cannot be built, if their dimensions differ, if embedding
+    /// the test texts fails, or if the resulting embeddings are not similar enough.
+    pub fn new(
+        EmbedderOptions { search, index }: EmbedderOptions,
+    ) -> Result<Self, NewEmbedderError> {
+        let search = SubEmbedder::new(search)?;
+        let index = SubEmbedder::new(index)?;
+
+        // check dimensions
+        if search.dimensions() != index.dimensions() {
+            return Err(NewEmbedderError::composite_dimensions_mismatch(
+                search.dimensions(),
+                index.dimensions(),
+            ));
+        }
+        // check similarity
+        let search_embeddings = search
+            .embed(
+                vec![
+                    "test".into(),
+                    "a brave dog".into(),
+                    "This is a sample text. It is meant to compare similarity.".into(),
+                ],
+                None,
+            )
+            .map_err(|error| NewEmbedderError::composite_test_embedding_failed(error, "search"))?;
+
+        let index_embeddings = index
+            .embed(
+                vec![
+                    "test".into(),
+                    "a brave dog".into(),
+                    "This is a sample text. It is meant to compare similarity.".into(),
+                ],
+                None,
+            )
+            .map_err(|error| {
+                NewEmbedderError::composite_test_embedding_failed(error, "indexing")
+            })?;
+
+        let hint = configuration_hint(&search, &index);
+
+        check_similarity(search_embeddings, index_embeddings, hint)?;
+
+        Ok(Self { search, index })
+    }
+
+    /// Indicates the dimensions of a single embedding produced by the embedder.
+    pub fn dimensions(&self) -> usize {
+        // can use the dimensions of any embedder since they should match
+        self.index.dimensions()
+    }
+
+    /// An optional distribution used to apply an affine transformation to the similarity score of a document.
+    pub fn distribution(&self) -> Option<DistributionShift> {
+        // 3 cases here:
+        // 1. distribution provided by user => use that one, which was stored in search
+        // 2. no user-provided distribution, distribution in search embedder => use that one
+        // 3. no user-provided distribution, no distribution in search embedder => use the distribution in indexing embedder
+        self.search.distribution().or_else(|| self.index.distribution())
+    }
+}
+
+impl SubEmbedder {
+    /// Builds the embedder variant matching the variant of `options`.
+    pub fn new(options: SubEmbedderOptions) -> std::result::Result<Self, NewEmbedderError> {
+        Ok(match options {
+            SubEmbedderOptions::HuggingFace(options) => {
+                Self::HuggingFace(hf::Embedder::new(options)?)
+            }
+            SubEmbedderOptions::OpenAi(options) => Self::OpenAi(openai::Embedder::new(options)?),
+            SubEmbedderOptions::Ollama(options) => Self::Ollama(ollama::Embedder::new(options)?),
+            SubEmbedderOptions::UserProvided(options) => {
+                Self::UserProvided(manual::Embedder::new(options))
+            }
+            SubEmbedderOptions::Rest(options) => {
+                Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?)
+            }
+        })
+    }
+
+    /// Embeds `texts` with the wrapped embedder.
+    ///
+    /// `deadline` is only forwarded to the OpenAI, Ollama and REST embedders.
+    pub fn embed(
+        &self,
+        texts: Vec<String>,
+        deadline: Option<Instant>,
+    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
+        match self {
+            SubEmbedder::HuggingFace(embedder) => embedder.embed(texts),
+            SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline),
+            SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline),
+            SubEmbedder::UserProvided(embedder) => embedder.embed(&texts),
+            SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline),
+        }
+    }
+
+    /// Embed multiple chunks of texts.
+    ///
+    /// Each chunk is composed of one or multiple texts.
+    pub fn embed_index(
+        &self,
+        text_chunks: Vec<Vec<String>>,
+        threads: &ThreadPoolNoAbort,
+    ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
+        match self {
+            SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
+            SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
+            SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
+            SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
+            SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
+        }
+    }
+
+    /// Non-owning variant of [`Self::embed_index`].
+    pub fn embed_index_ref(
+        &self,
+        texts: &[&str],
+        threads: &ThreadPoolNoAbort,
+    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
+        match self {
+            SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
+            SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
+            SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
+            SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
+            SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
+        }
+    }
+
+    /// Indicates the preferred number of chunks to pass to [`Self::embed_index`]
+    pub fn chunk_count_hint(&self) -> usize {
+        match self {
+            SubEmbedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
+            SubEmbedder::OpenAi(embedder) => embedder.chunk_count_hint(),
+            SubEmbedder::Ollama(embedder) => embedder.chunk_count_hint(),
+            SubEmbedder::UserProvided(_) => 100,
+            SubEmbedder::Rest(embedder) => embedder.chunk_count_hint(),
+        }
+    }
+
+    /// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`]
+    pub fn prompt_count_in_chunk_hint(&self) -> usize {
+        match self {
+            SubEmbedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(),
+            SubEmbedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(),
+            SubEmbedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(),
+            SubEmbedder::UserProvided(_) => 1,
+            SubEmbedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(),
+        }
+    }
+
+    /// Returns `false` for the user-provided embedder and `true` for every other variant.
+    pub fn uses_document_template(&self) -> bool {
+        match self {
+            SubEmbedder::HuggingFace(_)
+            | SubEmbedder::OpenAi(_)
+            | SubEmbedder::Ollama(_)
+            | SubEmbedder::Rest(_) => true,
+            SubEmbedder::UserProvided(_) => false,
+        }
+    }
+
+    /// Indicates the dimensions of a single embedding produced by the embedder.
+    pub fn dimensions(&self) -> usize {
+        match self {
+            SubEmbedder::HuggingFace(embedder) => embedder.dimensions(),
+            SubEmbedder::OpenAi(embedder) => embedder.dimensions(),
+            SubEmbedder::Ollama(embedder) => embedder.dimensions(),
+            SubEmbedder::UserProvided(embedder) => embedder.dimensions(),
+            SubEmbedder::Rest(embedder) => embedder.dimensions(),
+        }
+    }
+
+    /// An optional distribution used to apply an affine transformation to the similarity score of a document.
+    pub fn distribution(&self) -> Option<DistributionShift> {
+        match self {
+            SubEmbedder::HuggingFace(embedder) => embedder.distribution(),
+            SubEmbedder::OpenAi(embedder) => embedder.distribution(),
+            SubEmbedder::Ollama(embedder) => embedder.distribution(),
+            SubEmbedder::UserProvided(embedder) => embedder.distribution(),
+            SubEmbedder::Rest(embedder) => embedder.distribution(),
+        }
+    }
+}
+
+/// Checks that `left` and `right` hold the same number of embeddings and that the cosine distance
+/// between each pair, as computed by arroy, does not exceed `MAX_COMPOSITE_DISTANCE`.
+fn check_similarity(
+    left: Vec<Embedding>,
+    right: Vec<Embedding>,
+    hint: CompositeEmbedderContainsHuggingFace,
+) -> Result<(), NewEmbedderError> {
+    if left.len() != right.len() {
+        return Err(NewEmbedderError::composite_embedding_count_mismatch(left.len(), right.len()));
+    }
+
+    for (left, right) in left.into_iter().zip(right) {
+        let left = arroy::internals::UnalignedVector::from_slice(&left);
+        let right = arroy::internals::UnalignedVector::from_slice(&right);
+        let left = arroy::internals::Leaf {
+            header: arroy::distances::Cosine::new_header(&left),
+            vector: left,
+        };
+        let right = arroy::internals::Leaf {
+            header: arroy::distances::Cosine::new_header(&right),
+            vector: right,
+        };
+
+        let distance = arroy::distances::Cosine::built_distance(&left, &right);
+
+        if distance > super::MAX_COMPOSITE_DISTANCE {
+            return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint));
+        }
+    }
+    Ok(())
+}
+
+/// Reports which of the two sub-embedders, if any, is a Hugging Face embedder.
+fn configuration_hint(
+    search: &SubEmbedder,
+    index: &SubEmbedder,
+) -> CompositeEmbedderContainsHuggingFace {
+    match (search, index) {
+        (SubEmbedder::HuggingFace(_), SubEmbedder::HuggingFace(_)) => {
+            CompositeEmbedderContainsHuggingFace::Both
+        }
+        (SubEmbedder::HuggingFace(_), _) => CompositeEmbedderContainsHuggingFace::Search,
+        (_, SubEmbedder::HuggingFace(_)) => CompositeEmbedderContainsHuggingFace::Indexing,
+        _ => CompositeEmbedderContainsHuggingFace::None,
+    }
+}
diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs
index 650249bff..0993ded1d 100644
--- a/crates/milli/src/vector/error.rs
+++ b/crates/milli/src/vector/error.rs
@@ -6,6 +6,7 @@ use hf_hub::api::sync::ApiError;
 
 use super::parsed_vectors::ParsedVectorsDiff;
 use super::rest::ConfigurationSource;
+use super::MAX_COMPOSITE_DISTANCE;
 use crate::error::FaultSource;
 use crate::update::new::vector_document::VectorDocument;
 use crate::{FieldDistribution, PanicCatched};
@@ -335,6 +336,77 @@ impl NewEmbedderError {
     pub(crate) fn ollama_unsupported_url(url: String) -> NewEmbedderError {
         Self { kind: NewEmbedderErrorKind::OllamaUnsupportedUrl(url), fault: FaultSource::User }
     }
+
+    pub(crate) fn composite_dimensions_mismatch(
+        search_dimensions: usize,
+        index_dimensions: usize,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeDimensionsMismatch {
+                search_dimensions,
+                index_dimensions,
+            },
+            fault: FaultSource::User,
+        }
+    }
+
+    pub(crate) fn composite_test_embedding_failed(
+        inner: EmbedError,
+        failing_embedder: &'static str,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeTestEmbeddingFailed { inner, failing_embedder },
+            fault: FaultSource::Runtime,
+        }
+    }
+
+    pub(crate) fn composite_embedding_count_mismatch(
+        search_count: usize,
+        index_count: usize,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeEmbeddingCountMismatch {
+                search_count,
+                index_count,
+            },
+            fault: FaultSource::Runtime,
+        }
+    }
+
+    pub(crate) fn composite_embedding_value_mismatch(
+        distance: f32,
+        hint: CompositeEmbedderContainsHuggingFace,
+    ) -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::CompositeEmbeddingValueMismatch { distance, hint },
+            fault: FaultSource::User,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum CompositeEmbedderContainsHuggingFace {
+    Both,
+    Search,
+    Indexing,
+    None,
+}
+
+impl std::fmt::Display for CompositeEmbedderContainsHuggingFace {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            CompositeEmbedderContainsHuggingFace::Both => f.write_str(
+                "\n - Make sure the `model`, `revision` and `pooling` of both embedders match.",
+            ),
+            CompositeEmbedderContainsHuggingFace::Search => f.write_str(
+                "\n - Consider trying a different `pooling` method for the search embedder.",
+            ),
+            CompositeEmbedderContainsHuggingFace::Indexing => f.write_str(
+                "\n - Consider trying a different `pooling` method for the indexing embedder.",
+            ),
+            CompositeEmbedderContainsHuggingFace::None => Ok(()),
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -419,6 +491,14 @@ pub enum NewEmbedderErrorKind {
     CouldNotParseTemplate(String),
     #[error("unsupported Ollama URL.\n - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n - Got `{0}`")]
     OllamaUnsupportedUrl(String),
+    #[error("error while generating test embeddings.\n - the dimensions of embeddings produced at search time and at indexing time don't match.\n - Search time dimensions: {search_dimensions}\n - Indexing time dimensions: {index_dimensions}\n - Note: Dimensions of embeddings produced by both embedders are required to match.")]
+    CompositeDimensionsMismatch { search_dimensions: usize, index_dimensions: usize },
+    #[error("error while generating test embeddings.\n - could not generate test embedding with embedder at {failing_embedder} time.\n - Embedding failed with {inner}")]
+    CompositeTestEmbeddingFailed { inner: EmbedError, failing_embedder: &'static str },
+    #[error("error while generating test embeddings.\n - the number of generated embeddings differs.\n - {search_count} embeddings for the search time embedder.\n - {index_count} embeddings for the indexing time embedder.")]
+    CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize },
+    #[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")]
+    CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace },
 }
 
 pub struct PossibleEmbeddingMistakes {
diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs
index 74b52b1fe..47307295e 100644
--- a/crates/milli/src/vector/mod.rs
+++ b/crates/milli/src/vector/mod.rs
@@ -15,6 +15,7 @@ use self::error::{EmbedError, NewEmbedderError};
 use crate::prompt::{Prompt, PromptData};
 use crate::ThreadPoolNoAbort;
 
+pub mod composite;
 pub mod error;
 pub mod hf;
 pub mod json_template;
@@ -31,6 +32,7 @@ pub use self::error::Error;
 pub type Embedding = Vec<f32>;
 
 pub const REQUEST_PARALLELISM: usize = 40;
+pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
 
 pub struct ArroyWrapper {
     quantized: bool,