mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-03-06 22:02:34 +08:00
Add composite embedder
This commit is contained in:
parent
526476e168
commit
8d2d9066ba
280
crates/milli/src/vector/composite.rs
Normal file
280
crates/milli/src/vector/composite.rs
Normal file
@ -0,0 +1,280 @@
|
||||
use std::time::Instant;
|
||||
|
||||
use arroy::Distance;
|
||||
|
||||
use super::error::CompositeEmbedderContainsHuggingFace;
|
||||
use super::{
|
||||
hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, NewEmbedderError,
|
||||
};
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SubEmbedder {
|
||||
/// An embedder based on running local models, fetched from the Hugging Face Hub.
|
||||
HuggingFace(hf::Embedder),
|
||||
/// An embedder based on making embedding queries against the OpenAI API.
|
||||
OpenAi(openai::Embedder),
|
||||
/// An embedder based on the user providing the embeddings in the documents and queries.
|
||||
UserProvided(manual::Embedder),
|
||||
/// An embedder based on making embedding queries against an <https://ollama.com> embedding server.
|
||||
Ollama(ollama::Embedder),
|
||||
/// An embedder based on making embedding queries against a generic JSON/REST embedding server.
|
||||
Rest(rest::Embedder),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub enum SubEmbedderOptions {
|
||||
HuggingFace(hf::EmbedderOptions),
|
||||
OpenAi(openai::EmbedderOptions),
|
||||
Ollama(ollama::EmbedderOptions),
|
||||
UserProvided(manual::EmbedderOptions),
|
||||
Rest(rest::EmbedderOptions),
|
||||
}
|
||||
|
||||
impl SubEmbedderOptions {
|
||||
pub fn distribution(&self) -> Option<DistributionShift> {
|
||||
match self {
|
||||
SubEmbedderOptions::HuggingFace(embedder_options) => embedder_options.distribution,
|
||||
SubEmbedderOptions::OpenAi(embedder_options) => embedder_options.distribution,
|
||||
SubEmbedderOptions::Ollama(embedder_options) => embedder_options.distribution,
|
||||
SubEmbedderOptions::UserProvided(embedder_options) => embedder_options.distribution,
|
||||
SubEmbedderOptions::Rest(embedder_options) => embedder_options.distribution,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Embedder {
|
||||
pub(super) search: SubEmbedder,
|
||||
pub(super) index: SubEmbedder,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub struct EmbedderOptions {
|
||||
pub search: SubEmbedderOptions,
|
||||
pub index: SubEmbedderOptions,
|
||||
}
|
||||
|
||||
impl Embedder {
|
||||
pub fn new(
|
||||
EmbedderOptions { search, index }: EmbedderOptions,
|
||||
) -> Result<Self, NewEmbedderError> {
|
||||
let search = SubEmbedder::new(search)?;
|
||||
let index = SubEmbedder::new(index)?;
|
||||
|
||||
// check dimensions
|
||||
if search.dimensions() != index.dimensions() {
|
||||
return Err(NewEmbedderError::composite_dimensions_mismatch(
|
||||
search.dimensions(),
|
||||
index.dimensions(),
|
||||
));
|
||||
}
|
||||
// check similarity
|
||||
let search_embeddings = search
|
||||
.embed(
|
||||
vec![
|
||||
"test".into(),
|
||||
"a brave dog".into(),
|
||||
"This is a sample text. It is meant to compare similarity.".into(),
|
||||
],
|
||||
None,
|
||||
)
|
||||
.map_err(|error| NewEmbedderError::composite_test_embedding_failed(error, "search"))?;
|
||||
|
||||
let index_embeddings = index
|
||||
.embed(
|
||||
vec![
|
||||
"test".into(),
|
||||
"a brave dog".into(),
|
||||
"This is a sample text. It is meant to compare similarity.".into(),
|
||||
],
|
||||
None,
|
||||
)
|
||||
.map_err(|error| {
|
||||
NewEmbedderError::composite_test_embedding_failed(error, "indexing")
|
||||
})?;
|
||||
|
||||
let hint = configuration_hint(&search, &index);
|
||||
|
||||
check_similarity(search_embeddings, index_embeddings, hint)?;
|
||||
|
||||
Ok(Self { search, index })
|
||||
}
|
||||
|
||||
/// Indicates the dimensions of a single embedding produced by the embedder.
|
||||
pub fn dimensions(&self) -> usize {
|
||||
// can use the dimensions of any embedder since they should match
|
||||
self.index.dimensions()
|
||||
}
|
||||
|
||||
/// An optional distribution used to apply an affine transformation to the similarity score of a document.
|
||||
pub fn distribution(&self) -> Option<DistributionShift> {
|
||||
// 3 cases here:
|
||||
// 1. distribution provided by user => use that one, which was stored in search
|
||||
// 2. no user-provided distribution, distribution in search embedder => use that one
|
||||
// 2. no user-provided distribution, no distribution in search embedder => use the distribution in indexing embedder
|
||||
self.search.distribution().or_else(|| self.index.distribution())
|
||||
}
|
||||
}
|
||||
|
||||
impl SubEmbedder {
|
||||
pub fn new(options: SubEmbedderOptions) -> std::result::Result<Self, NewEmbedderError> {
|
||||
Ok(match options {
|
||||
SubEmbedderOptions::HuggingFace(options) => {
|
||||
Self::HuggingFace(hf::Embedder::new(options)?)
|
||||
}
|
||||
SubEmbedderOptions::OpenAi(options) => Self::OpenAi(openai::Embedder::new(options)?),
|
||||
SubEmbedderOptions::Ollama(options) => Self::Ollama(ollama::Embedder::new(options)?),
|
||||
SubEmbedderOptions::UserProvided(options) => {
|
||||
Self::UserProvided(manual::Embedder::new(options))
|
||||
}
|
||||
SubEmbedderOptions::Rest(options) => {
|
||||
Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn embed(
|
||||
&self,
|
||||
texts: Vec<String>,
|
||||
deadline: Option<Instant>,
|
||||
) -> std::result::Result<Vec<Embedding>, EmbedError> {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(embedder) => embedder.embed(texts),
|
||||
SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline),
|
||||
SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline),
|
||||
SubEmbedder::UserProvided(embedder) => embedder.embed(&texts),
|
||||
SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline),
|
||||
}
|
||||
}
|
||||
|
||||
/// Embed multiple chunks of texts.
|
||||
///
|
||||
/// Each chunk is composed of one or multiple texts.
|
||||
pub fn embed_index(
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
|
||||
SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
|
||||
SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
|
||||
SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
|
||||
SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
|
||||
}
|
||||
}
|
||||
|
||||
/// Non-owning variant of [`Self::embed_index`].
|
||||
pub fn embed_index_ref(
|
||||
&self,
|
||||
texts: &[&str],
|
||||
threads: &ThreadPoolNoAbort,
|
||||
) -> std::result::Result<Vec<Embedding>, EmbedError> {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
|
||||
SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
|
||||
SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
|
||||
SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
|
||||
SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
|
||||
}
|
||||
}
|
||||
|
||||
/// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`]
|
||||
pub fn chunk_count_hint(&self) -> usize {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
|
||||
SubEmbedder::OpenAi(embedder) => embedder.chunk_count_hint(),
|
||||
SubEmbedder::Ollama(embedder) => embedder.chunk_count_hint(),
|
||||
SubEmbedder::UserProvided(_) => 100,
|
||||
SubEmbedder::Rest(embedder) => embedder.chunk_count_hint(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`]
|
||||
pub fn prompt_count_in_chunk_hint(&self) -> usize {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(),
|
||||
SubEmbedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(),
|
||||
SubEmbedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(),
|
||||
SubEmbedder::UserProvided(_) => 1,
|
||||
SubEmbedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uses_document_template(&self) -> bool {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(_)
|
||||
| SubEmbedder::OpenAi(_)
|
||||
| SubEmbedder::Ollama(_)
|
||||
| SubEmbedder::Rest(_) => true,
|
||||
SubEmbedder::UserProvided(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Indicates the dimensions of a single embedding produced by the embedder.
|
||||
pub fn dimensions(&self) -> usize {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(embedder) => embedder.dimensions(),
|
||||
SubEmbedder::OpenAi(embedder) => embedder.dimensions(),
|
||||
SubEmbedder::Ollama(embedder) => embedder.dimensions(),
|
||||
SubEmbedder::UserProvided(embedder) => embedder.dimensions(),
|
||||
SubEmbedder::Rest(embedder) => embedder.dimensions(),
|
||||
}
|
||||
}
|
||||
|
||||
/// An optional distribution used to apply an affine transformation to the similarity score of a document.
|
||||
pub fn distribution(&self) -> Option<DistributionShift> {
|
||||
match self {
|
||||
SubEmbedder::HuggingFace(embedder) => embedder.distribution(),
|
||||
SubEmbedder::OpenAi(embedder) => embedder.distribution(),
|
||||
SubEmbedder::Ollama(embedder) => embedder.distribution(),
|
||||
SubEmbedder::UserProvided(embedder) => embedder.distribution(),
|
||||
SubEmbedder::Rest(embedder) => embedder.distribution(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn check_similarity(
|
||||
left: Vec<Embedding>,
|
||||
right: Vec<Embedding>,
|
||||
hint: CompositeEmbedderContainsHuggingFace,
|
||||
) -> Result<(), NewEmbedderError> {
|
||||
if left.len() != right.len() {
|
||||
return Err(NewEmbedderError::composite_embedding_count_mismatch(left.len(), right.len()));
|
||||
}
|
||||
|
||||
for (left, right) in left.into_iter().zip(right) {
|
||||
let left = arroy::internals::UnalignedVector::from_slice(&left);
|
||||
let right = arroy::internals::UnalignedVector::from_slice(&right);
|
||||
let left = arroy::internals::Leaf {
|
||||
header: arroy::distances::Cosine::new_header(&left),
|
||||
vector: left,
|
||||
};
|
||||
let right = arroy::internals::Leaf {
|
||||
header: arroy::distances::Cosine::new_header(&right),
|
||||
vector: right,
|
||||
};
|
||||
|
||||
let distance = arroy::distances::Cosine::built_distance(&left, &right);
|
||||
|
||||
if distance > super::MAX_COMPOSITE_DISTANCE {
|
||||
return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn configuration_hint(
|
||||
search: &SubEmbedder,
|
||||
index: &SubEmbedder,
|
||||
) -> CompositeEmbedderContainsHuggingFace {
|
||||
match (search, index) {
|
||||
(SubEmbedder::HuggingFace(_), SubEmbedder::HuggingFace(_)) => {
|
||||
CompositeEmbedderContainsHuggingFace::Both
|
||||
}
|
||||
(SubEmbedder::HuggingFace(_), _) => CompositeEmbedderContainsHuggingFace::Search,
|
||||
(_, SubEmbedder::HuggingFace(_)) => CompositeEmbedderContainsHuggingFace::Indexing,
|
||||
_ => CompositeEmbedderContainsHuggingFace::None,
|
||||
}
|
||||
}
|
@ -6,6 +6,7 @@ use hf_hub::api::sync::ApiError;
|
||||
|
||||
use super::parsed_vectors::ParsedVectorsDiff;
|
||||
use super::rest::ConfigurationSource;
|
||||
use super::MAX_COMPOSITE_DISTANCE;
|
||||
use crate::error::FaultSource;
|
||||
use crate::update::new::vector_document::VectorDocument;
|
||||
use crate::{FieldDistribution, PanicCatched};
|
||||
@ -335,6 +336,77 @@ impl NewEmbedderError {
|
||||
pub(crate) fn ollama_unsupported_url(url: String) -> NewEmbedderError {
|
||||
Self { kind: NewEmbedderErrorKind::OllamaUnsupportedUrl(url), fault: FaultSource::User }
|
||||
}
|
||||
|
||||
pub(crate) fn composite_dimensions_mismatch(
|
||||
search_dimensions: usize,
|
||||
index_dimensions: usize,
|
||||
) -> NewEmbedderError {
|
||||
Self {
|
||||
kind: NewEmbedderErrorKind::CompositeDimensionsMismatch {
|
||||
search_dimensions,
|
||||
index_dimensions,
|
||||
},
|
||||
fault: FaultSource::User,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn composite_test_embedding_failed(
|
||||
inner: EmbedError,
|
||||
failing_embedder: &'static str,
|
||||
) -> NewEmbedderError {
|
||||
Self {
|
||||
kind: NewEmbedderErrorKind::CompositeTestEmbeddingFailed { inner, failing_embedder },
|
||||
fault: FaultSource::Runtime,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn composite_embedding_count_mismatch(
|
||||
search_count: usize,
|
||||
index_count: usize,
|
||||
) -> NewEmbedderError {
|
||||
Self {
|
||||
kind: NewEmbedderErrorKind::CompositeEmbeddingCountMismatch {
|
||||
search_count,
|
||||
index_count,
|
||||
},
|
||||
fault: FaultSource::Runtime,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn composite_embedding_value_mismatch(
|
||||
distance: f32,
|
||||
hint: CompositeEmbedderContainsHuggingFace,
|
||||
) -> NewEmbedderError {
|
||||
Self {
|
||||
kind: NewEmbedderErrorKind::CompositeEmbeddingValueMismatch { distance, hint },
|
||||
fault: FaultSource::User,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Records which sub-embedders of a composite embedder are Hugging Face embedders.
///
/// Its `Display` implementation renders an extra hint line, or nothing for `None`.
#[derive(Debug, Clone, Copy)]
pub enum CompositeEmbedderContainsHuggingFace {
    /// Both sub-embedders are Hugging Face embedders.
    Both,
    /// Only the search sub-embedder is a Hugging Face embedder.
    Search,
    /// Only the indexing sub-embedder is a Hugging Face embedder.
    Indexing,
    /// Neither sub-embedder is a Hugging Face embedder.
    None,
}

impl std::fmt::Display for CompositeEmbedderContainsHuggingFace {
    /// Writes the hint matching the variant; writes nothing at all for `None`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let hint = match self {
            Self::Both => {
                "\n - Make sure the `model`, `revision` and `pooling` of both embedders match."
            }
            Self::Search => {
                "\n - Consider trying a different `pooling` method for the search embedder."
            }
            Self::Indexing => {
                "\n - Consider trying a different `pooling` method for the indexing embedder."
            }
            // No hint to give: leave the formatter untouched.
            Self::None => return Ok(()),
        };
        f.write_str(hint)
    }
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@ -419,6 +491,14 @@ pub enum NewEmbedderErrorKind {
|
||||
CouldNotParseTemplate(String),
|
||||
#[error("unsupported Ollama URL.\n - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n - Got `{0}`")]
|
||||
OllamaUnsupportedUrl(String),
|
||||
#[error("error while generating test embeddings.\n - the dimensions of embeddings produced at search time and at indexing time don't match.\n - Search time dimensions: {search_dimensions}\n - Indexing time dimensions: {index_dimensions}\n - Note: Dimensions of embeddings produced by both embedders are required to match.")]
|
||||
CompositeDimensionsMismatch { search_dimensions: usize, index_dimensions: usize },
|
||||
#[error("error while generating test embeddings.\n - could not generate test embedding with embedder at {failing_embedder} time.\n - Embedding failed with {inner}")]
|
||||
CompositeTestEmbeddingFailed { inner: EmbedError, failing_embedder: &'static str },
|
||||
#[error("error while generating test embeddings.\n - the number of generated embeddings differs.\n - {search_count} embeddings for the search time embedder.\n - {index_count} embeddings for the indexing time embedder.")]
|
||||
CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize },
|
||||
#[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")]
|
||||
CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace },
|
||||
}
|
||||
|
||||
pub struct PossibleEmbeddingMistakes {
|
||||
|
@ -15,6 +15,7 @@ use self::error::{EmbedError, NewEmbedderError};
|
||||
use crate::prompt::{Prompt, PromptData};
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
pub mod composite;
|
||||
pub mod error;
|
||||
pub mod hf;
|
||||
pub mod json_template;
|
||||
@ -31,6 +32,7 @@ pub use self::error::Error;
|
||||
pub type Embedding = Vec<f32>;
|
||||
|
||||
pub const REQUEST_PARALLELISM: usize = 40;
|
||||
/// Maximum distance tolerated between the test embeddings produced by the search and
/// indexing sub-embedders of a composite embedder; a larger distance is reported as an
/// embedding value mismatch when the composite embedder is created.
pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
|
||||
|
||||
pub struct ArroyWrapper {
|
||||
quantized: bool,
|
||||
|
Loading…
x
Reference in New Issue
Block a user