mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 19:45:05 +08:00
Reintroduce vector errors
This commit is contained in:
parent
0f6a1dbce7
commit
df5bc3c9fd
@ -11,13 +11,16 @@ use crate::update::new::channel::EmbeddingSender;
|
|||||||
use crate::update::new::indexer::document_changes::{Extractor, FullySend};
|
use crate::update::new::indexer::document_changes::{Extractor, FullySend};
|
||||||
use crate::update::new::vector_document::VectorDocument;
|
use crate::update::new::vector_document::VectorDocument;
|
||||||
use crate::update::new::DocumentChange;
|
use crate::update::new::DocumentChange;
|
||||||
use crate::vector::error::EmbedErrorKind;
|
use crate::vector::error::{
|
||||||
|
EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump,
|
||||||
|
};
|
||||||
use crate::vector::{Embedder, Embedding, EmbeddingConfigs};
|
use crate::vector::{Embedder, Embedding, EmbeddingConfigs};
|
||||||
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, UserError};
|
use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError};
|
||||||
|
|
||||||
pub struct EmbeddingExtractor<'a> {
|
pub struct EmbeddingExtractor<'a> {
|
||||||
embedders: &'a EmbeddingConfigs,
|
embedders: &'a EmbeddingConfigs,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
|
possible_embedding_mistakes: PossibleEmbeddingMistakes,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -25,9 +28,11 @@ impl<'a> EmbeddingExtractor<'a> {
|
|||||||
pub fn new(
|
pub fn new(
|
||||||
embedders: &'a EmbeddingConfigs,
|
embedders: &'a EmbeddingConfigs,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
|
field_distribution: &'a FieldDistribution,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
Self { embedders, sender, threads }
|
let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution);
|
||||||
|
Self { embedders, sender, threads, possible_embedding_mistakes }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -50,6 +55,8 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
>,
|
>,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let embedders = self.embedders.inner_as_ref();
|
let embedders = self.embedders.inner_as_ref();
|
||||||
|
let mut unused_vectors_distribution =
|
||||||
|
UnusedVectorsDistributionBump::new_in(&context.doc_alloc);
|
||||||
|
|
||||||
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
|
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
|
||||||
for (embedder_name, (embedder, prompt, _is_quantized)) in embedders {
|
for (embedder_name, (embedder, prompt, _is_quantized)) in embedders {
|
||||||
@ -66,6 +73,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
embedder_name,
|
embedder_name,
|
||||||
prompt,
|
prompt,
|
||||||
&context.data.0,
|
&context.data.0,
|
||||||
|
&self.possible_embedding_mistakes,
|
||||||
self.threads,
|
self.threads,
|
||||||
self.sender,
|
self.sender,
|
||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
@ -87,6 +95,10 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
)?;
|
)?;
|
||||||
let new_vectors = update.updated_vectors(&context.doc_alloc)?;
|
let new_vectors = update.updated_vectors(&context.doc_alloc)?;
|
||||||
|
|
||||||
|
if let Some(new_vectors) = &new_vectors {
|
||||||
|
unused_vectors_distribution.append(new_vectors);
|
||||||
|
}
|
||||||
|
|
||||||
for chunks in &mut all_chunks {
|
for chunks in &mut all_chunks {
|
||||||
let embedder_name = chunks.embedder_name();
|
let embedder_name = chunks.embedder_name();
|
||||||
let prompt = chunks.prompt();
|
let prompt = chunks.prompt();
|
||||||
@ -128,7 +140,11 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
if new_rendered != old_rendered {
|
if new_rendered != old_rendered {
|
||||||
chunks.set_autogenerated(update.docid(), new_rendered)?;
|
chunks.set_autogenerated(
|
||||||
|
update.docid(),
|
||||||
|
new_rendered,
|
||||||
|
&unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if old_vectors.regenerate {
|
} else if old_vectors.regenerate {
|
||||||
@ -151,17 +167,25 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
if new_rendered != old_rendered {
|
if new_rendered != old_rendered {
|
||||||
chunks.set_autogenerated(update.docid(), new_rendered)?;
|
chunks.set_autogenerated(
|
||||||
|
update.docid(),
|
||||||
|
new_rendered,
|
||||||
|
&unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
DocumentChange::Insertion(insertion) => {
|
DocumentChange::Insertion(insertion) => {
|
||||||
|
let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?;
|
||||||
|
if let Some(new_vectors) = &new_vectors {
|
||||||
|
unused_vectors_distribution.append(new_vectors);
|
||||||
|
}
|
||||||
|
|
||||||
for chunks in &mut all_chunks {
|
for chunks in &mut all_chunks {
|
||||||
let embedder_name = chunks.embedder_name();
|
let embedder_name = chunks.embedder_name();
|
||||||
let prompt = chunks.prompt();
|
let prompt = chunks.prompt();
|
||||||
// if no inserted vectors, then regenerate: true + no embeddings => autogenerate
|
// if no inserted vectors, then regenerate: true + no embeddings => autogenerate
|
||||||
let new_vectors = insertion.inserted_vectors(&context.doc_alloc)?;
|
|
||||||
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
||||||
new_vectors.vectors_for_key(embedder_name).transpose()
|
new_vectors.vectors_for_key(embedder_name).transpose()
|
||||||
}) {
|
}) {
|
||||||
@ -178,7 +202,11 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
context.new_fields_ids_map,
|
context.new_fields_ids_map,
|
||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
chunks.set_autogenerated(insertion.docid(), rendered)?;
|
chunks.set_autogenerated(
|
||||||
|
insertion.docid(),
|
||||||
|
rendered,
|
||||||
|
&unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let rendered = prompt.render_document(
|
let rendered = prompt.render_document(
|
||||||
@ -186,7 +214,11 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
context.new_fields_ids_map,
|
context.new_fields_ids_map,
|
||||||
&context.doc_alloc,
|
&context.doc_alloc,
|
||||||
)?;
|
)?;
|
||||||
chunks.set_autogenerated(insertion.docid(), rendered)?;
|
chunks.set_autogenerated(
|
||||||
|
insertion.docid(),
|
||||||
|
rendered,
|
||||||
|
&unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -194,7 +226,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for chunk in all_chunks {
|
for chunk in all_chunks {
|
||||||
chunk.drain()?;
|
chunk.drain(&unused_vectors_distribution)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -215,7 +247,7 @@ struct Chunks<'a> {
|
|||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embedder_name: &'a str,
|
embedder_name: &'a str,
|
||||||
prompt: &'a Prompt,
|
prompt: &'a Prompt,
|
||||||
|
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
|
||||||
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
|
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
@ -229,6 +261,7 @@ impl<'a> Chunks<'a> {
|
|||||||
embedder_name: &'a str,
|
embedder_name: &'a str,
|
||||||
prompt: &'a Prompt,
|
prompt: &'a Prompt,
|
||||||
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
|
user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
|
||||||
|
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
doc_alloc: &'a Bump,
|
doc_alloc: &'a Bump,
|
||||||
@ -241,6 +274,7 @@ impl<'a> Chunks<'a> {
|
|||||||
ids,
|
ids,
|
||||||
embedder,
|
embedder,
|
||||||
prompt,
|
prompt,
|
||||||
|
possible_embedding_mistakes,
|
||||||
threads,
|
threads,
|
||||||
sender,
|
sender,
|
||||||
embedder_id,
|
embedder_id,
|
||||||
@ -249,7 +283,12 @@ impl<'a> Chunks<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_autogenerated(&mut self, docid: DocumentId, rendered: &'a str) -> Result<()> {
|
pub fn set_autogenerated(
|
||||||
|
&mut self,
|
||||||
|
docid: DocumentId,
|
||||||
|
rendered: &'a str,
|
||||||
|
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
||||||
|
) -> Result<()> {
|
||||||
if self.texts.len() < self.texts.capacity() {
|
if self.texts.len() < self.texts.capacity() {
|
||||||
self.texts.push(rendered);
|
self.texts.push(rendered);
|
||||||
self.ids.push(docid);
|
self.ids.push(docid);
|
||||||
@ -262,18 +301,25 @@ impl<'a> Chunks<'a> {
|
|||||||
self.embedder,
|
self.embedder,
|
||||||
self.embedder_id,
|
self.embedder_id,
|
||||||
self.embedder_name,
|
self.embedder_name,
|
||||||
|
self.possible_embedding_mistakes,
|
||||||
|
unused_vectors_distribution,
|
||||||
self.threads,
|
self.threads,
|
||||||
self.sender,
|
self.sender,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn drain(mut self) -> Result<()> {
|
pub fn drain(
|
||||||
|
mut self,
|
||||||
|
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
||||||
|
) -> Result<()> {
|
||||||
let res = Self::embed_chunks(
|
let res = Self::embed_chunks(
|
||||||
&mut self.texts,
|
&mut self.texts,
|
||||||
&mut self.ids,
|
&mut self.ids,
|
||||||
self.embedder,
|
self.embedder,
|
||||||
self.embedder_id,
|
self.embedder_id,
|
||||||
self.embedder_name,
|
self.embedder_name,
|
||||||
|
self.possible_embedding_mistakes,
|
||||||
|
unused_vectors_distribution,
|
||||||
self.threads,
|
self.threads,
|
||||||
self.sender,
|
self.sender,
|
||||||
);
|
);
|
||||||
@ -285,11 +331,13 @@ impl<'a> Chunks<'a> {
|
|||||||
pub fn embed_chunks(
|
pub fn embed_chunks(
|
||||||
texts: &mut BVec<'a, &'a str>,
|
texts: &mut BVec<'a, &'a str>,
|
||||||
ids: &mut BVec<'a, DocumentId>,
|
ids: &mut BVec<'a, DocumentId>,
|
||||||
embedder: &'a Embedder,
|
embedder: &Embedder,
|
||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embedder_name: &str,
|
embedder_name: &str,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
||||||
|
threads: &ThreadPoolNoAbort,
|
||||||
|
sender: &EmbeddingSender<'a>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
|
let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
|
||||||
Ok(embeddings) => {
|
Ok(embeddings) => {
|
||||||
@ -312,10 +360,8 @@ impl<'a> Chunks<'a> {
|
|||||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// FIXME: reintroduce possible_embedding_mistakes and possible_embedding_mistakes
|
|
||||||
let mut hint_count = 0;
|
let mut hint_count = 0;
|
||||||
|
|
||||||
/*
|
|
||||||
for (vector_misspelling, count) in
|
for (vector_misspelling, count) in
|
||||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||||
{
|
{
|
||||||
@ -324,13 +370,13 @@ impl<'a> Chunks<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
.embedder_mistakes_bump(embedder_name, unused_vectors_distribution)
|
||||||
.take(2)
|
.take(2)
|
||||||
{
|
{
|
||||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||||
hint_count += 1;
|
hint_count += 1;
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
if hint_count == 0 {
|
if hint_count == 0 {
|
||||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||||
msg += &format!(
|
msg += &format!(
|
||||||
|
@ -296,7 +296,7 @@ where
|
|||||||
}
|
}
|
||||||
/// FIXME: need access to `merger_sender`
|
/// FIXME: need access to `merger_sender`
|
||||||
let embedding_sender = todo!();
|
let embedding_sender = todo!();
|
||||||
let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, request_threads());
|
let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, &field_distribution, request_threads());
|
||||||
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
|
||||||
|
|
||||||
for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?;
|
for_each_document_change(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore)?;
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use bumpalo::Bump;
|
||||||
use hf_hub::api::sync::ApiError;
|
use hf_hub::api::sync::ApiError;
|
||||||
|
|
||||||
use super::parsed_vectors::ParsedVectorsDiff;
|
use super::parsed_vectors::ParsedVectorsDiff;
|
||||||
use super::rest::ConfigurationSource;
|
use super::rest::ConfigurationSource;
|
||||||
use crate::error::FaultSource;
|
use crate::error::FaultSource;
|
||||||
|
use crate::update::new::vector_document::VectorDocument;
|
||||||
use crate::{FieldDistribution, PanicCatched};
|
use crate::{FieldDistribution, PanicCatched};
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
@ -417,6 +419,23 @@ impl PossibleEmbeddingMistakes {
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn embedder_mistakes_bump<'a, 'doc: 'a>(
|
||||||
|
&'a self,
|
||||||
|
embedder_name: &'a str,
|
||||||
|
unused_vectors_distribution: &'a UnusedVectorsDistributionBump<'doc>,
|
||||||
|
) -> impl Iterator<Item = (&'a str, u64)> + 'a {
|
||||||
|
let builder = levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true);
|
||||||
|
let automata = builder.build_dfa(embedder_name);
|
||||||
|
|
||||||
|
unused_vectors_distribution.0.iter().filter_map(move |(field, count)| {
|
||||||
|
match automata.eval(field) {
|
||||||
|
levenshtein_automata::Distance::Exact(0) => None,
|
||||||
|
levenshtein_automata::Distance::Exact(_) => Some((*field, *count)),
|
||||||
|
levenshtein_automata::Distance::AtLeast(_) => None,
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@ -433,3 +452,23 @@ impl UnusedVectorsDistribution {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct UnusedVectorsDistributionBump<'doc>(
|
||||||
|
hashbrown::HashMap<&'doc str, u64, hashbrown::hash_map::DefaultHashBuilder, &'doc Bump>,
|
||||||
|
);
|
||||||
|
|
||||||
|
impl<'doc> UnusedVectorsDistributionBump<'doc> {
|
||||||
|
pub fn new_in(doc_alloc: &'doc Bump) -> Self {
|
||||||
|
Self(hashbrown::HashMap::new_in(doc_alloc))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn append(&mut self, vectors: &impl VectorDocument<'doc>) -> Result<(), crate::Error> {
|
||||||
|
for res in vectors.iter_vectors() {
|
||||||
|
let (embedder_name, entry) = res?;
|
||||||
|
if !entry.has_configured_embedder {
|
||||||
|
*self.0.entry(embedder_name).or_default() += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user