mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Fix vector error messages
This commit is contained in:
parent
bfdcd1cf33
commit
3b0cb5b487
@ -151,6 +151,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
if new_rendered != old_rendered {
|
if new_rendered != old_rendered {
|
||||||
chunks.set_autogenerated(
|
chunks.set_autogenerated(
|
||||||
update.docid(),
|
update.docid(),
|
||||||
|
update.external_document_id(),
|
||||||
new_rendered,
|
new_rendered,
|
||||||
&unused_vectors_distribution,
|
&unused_vectors_distribution,
|
||||||
)?;
|
)?;
|
||||||
@ -178,6 +179,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
if new_rendered != old_rendered {
|
if new_rendered != old_rendered {
|
||||||
chunks.set_autogenerated(
|
chunks.set_autogenerated(
|
||||||
update.docid(),
|
update.docid(),
|
||||||
|
update.external_document_id(),
|
||||||
new_rendered,
|
new_rendered,
|
||||||
&unused_vectors_distribution,
|
&unused_vectors_distribution,
|
||||||
)?;
|
)?;
|
||||||
@ -221,6 +223,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
)?;
|
)?;
|
||||||
chunks.set_autogenerated(
|
chunks.set_autogenerated(
|
||||||
insertion.docid(),
|
insertion.docid(),
|
||||||
|
insertion.external_document_id(),
|
||||||
rendered,
|
rendered,
|
||||||
&unused_vectors_distribution,
|
&unused_vectors_distribution,
|
||||||
)?;
|
)?;
|
||||||
@ -233,6 +236,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
|||||||
)?;
|
)?;
|
||||||
chunks.set_autogenerated(
|
chunks.set_autogenerated(
|
||||||
insertion.docid(),
|
insertion.docid(),
|
||||||
|
insertion.external_document_id(),
|
||||||
rendered,
|
rendered,
|
||||||
&unused_vectors_distribution,
|
&unused_vectors_distribution,
|
||||||
)?;
|
)?;
|
||||||
@ -268,6 +272,7 @@ struct Chunks<'a, 'extractor> {
|
|||||||
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
||||||
threads: &'a ThreadPoolNoAbort,
|
threads: &'a ThreadPoolNoAbort,
|
||||||
sender: &'a EmbeddingSender<'a>,
|
sender: &'a EmbeddingSender<'a>,
|
||||||
|
has_manual_generation: Option<&'a str>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
||||||
@ -297,15 +302,22 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
|||||||
embedder_id,
|
embedder_id,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
user_provided,
|
user_provided,
|
||||||
|
has_manual_generation: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_autogenerated(
|
pub fn set_autogenerated(
|
||||||
&mut self,
|
&mut self,
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
|
external_docid: &'a str,
|
||||||
rendered: &'a str,
|
rendered: &'a str,
|
||||||
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_));
|
||||||
|
if is_manual {
|
||||||
|
self.has_manual_generation.get_or_insert(external_docid);
|
||||||
|
}
|
||||||
|
|
||||||
if self.texts.len() < self.texts.capacity() {
|
if self.texts.len() < self.texts.capacity() {
|
||||||
self.texts.push(rendered);
|
self.texts.push(rendered);
|
||||||
self.ids.push(docid);
|
self.ids.push(docid);
|
||||||
@ -322,6 +334,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
|||||||
unused_vectors_distribution,
|
unused_vectors_distribution,
|
||||||
self.threads,
|
self.threads,
|
||||||
self.sender,
|
self.sender,
|
||||||
|
self.has_manual_generation.take(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -339,6 +352,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
|||||||
unused_vectors_distribution,
|
unused_vectors_distribution,
|
||||||
self.threads,
|
self.threads,
|
||||||
self.sender,
|
self.sender,
|
||||||
|
self.has_manual_generation,
|
||||||
);
|
);
|
||||||
// optimization: don't run bvec dtors as they only contain bumpalo allocated stuff
|
// optimization: don't run bvec dtors as they only contain bumpalo allocated stuff
|
||||||
std::mem::forget(self);
|
std::mem::forget(self);
|
||||||
@ -356,7 +370,46 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
|||||||
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
||||||
threads: &ThreadPoolNoAbort,
|
threads: &ThreadPoolNoAbort,
|
||||||
sender: &EmbeddingSender<'a>,
|
sender: &EmbeddingSender<'a>,
|
||||||
|
has_manual_generation: Option<&'a str>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
if let Some(external_docid) = has_manual_generation {
|
||||||
|
let mut msg = format!(
|
||||||
|
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}",
|
||||||
|
external_docid,
|
||||||
|
if ids.len() > 1 {
|
||||||
|
format!(" and at least {} other document(s)", ids.len() - 1)
|
||||||
|
} else {
|
||||||
|
"".to_string()
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||||
|
|
||||||
|
let mut hint_count = 0;
|
||||||
|
|
||||||
|
for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2)
|
||||||
|
{
|
||||||
|
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||||
|
hint_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||||
|
.embedder_mistakes_bump(embedder_name, unused_vectors_distribution)
|
||||||
|
.take(2)
|
||||||
|
{
|
||||||
|
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||||
|
hint_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if hint_count == 0 {
|
||||||
|
msg += &format!(
|
||||||
|
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)));
|
||||||
|
}
|
||||||
|
|
||||||
let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
|
let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
|
||||||
Ok(embeddings) => {
|
Ok(embeddings) => {
|
||||||
for (docid, embedding) in ids.into_iter().zip(embeddings) {
|
for (docid, embedding) in ids.into_iter().zip(embeddings) {
|
||||||
|
@ -648,7 +648,7 @@ impl Embedder {
|
|||||||
Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
|
Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
|
||||||
Embedder::OpenAi(embedder) => embedder.chunk_count_hint(),
|
Embedder::OpenAi(embedder) => embedder.chunk_count_hint(),
|
||||||
Embedder::Ollama(embedder) => embedder.chunk_count_hint(),
|
Embedder::Ollama(embedder) => embedder.chunk_count_hint(),
|
||||||
Embedder::UserProvided(_) => 1,
|
Embedder::UserProvided(_) => 100,
|
||||||
Embedder::Rest(embedder) => embedder.chunk_count_hint(),
|
Embedder::Rest(embedder) => embedder.chunk_count_hint(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user