mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Merge #4906
4906: Add searchable fields to template r=dureuill a=dureuill # Pull Request ## Related issue Fixes #4886 See [public usage](https://meilisearch.notion.site/v1-11-AI-search-changes-0e37727193884a70999f254fa953ce6e#1dd6f0eee5a1422888e1c5d48e107cd1) ## What does this PR do? - `Prompt::render` now requires and uses metadata to indicate if the fields are searchable or not - Changes default template - Updated tests - Correctly reindex vectors when the list of searchable fields changes in a settings update. Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
commit
80408c92dc
@ -5617,7 +5617,7 @@ mod tests {
|
|||||||
},
|
},
|
||||||
),
|
),
|
||||||
prompt: PromptData {
|
prompt: PromptData {
|
||||||
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
|
template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
user_provided: RoaringBitmap<[0]>,
|
user_provided: RoaringBitmap<[0]>,
|
||||||
@ -5657,7 +5657,7 @@ mod tests {
|
|||||||
},
|
},
|
||||||
),
|
),
|
||||||
prompt: PromptData {
|
prompt: PromptData {
|
||||||
template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
|
template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
user_provided: RoaringBitmap<[]>,
|
user_provided: RoaringBitmap<[]>,
|
||||||
|
@ -190,7 +190,7 @@ async fn secrets_are_hidden_in_settings() {
|
|||||||
"source": "rest",
|
"source": "rest",
|
||||||
"apiKey": "My suXXXXXX...",
|
"apiKey": "My suXXXXXX...",
|
||||||
"dimensions": 4,
|
"dimensions": 4,
|
||||||
"documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
|
"documentTemplate": "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}",
|
||||||
"url": "https://localhost:7777",
|
"url": "https://localhost:7777",
|
||||||
"request": "{{text}}",
|
"request": "{{text}}",
|
||||||
"response": "{{embedding}}",
|
"response": "{{embedding}}",
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
use std::collections::BTreeMap;
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
|
||||||
use meili_snap::{json_string, snapshot};
|
use meili_snap::{json_string, snapshot};
|
||||||
@ -37,6 +38,46 @@ async fn create_mock() -> (MockServer, Value) {
|
|||||||
(mock_server, embedder_settings)
|
(mock_server, embedder_settings)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn create_mock_map() -> (MockServer, Value) {
|
||||||
|
let mock_server = MockServer::start().await;
|
||||||
|
|
||||||
|
let text_to_embedding: BTreeMap<_, _> = vec![
|
||||||
|
// text -> embedding
|
||||||
|
("name: kefir\n", [0.0, 0.1, 0.2]),
|
||||||
|
]
|
||||||
|
// turn into btree
|
||||||
|
.into_iter()
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Mock::given(method("POST"))
|
||||||
|
.and(path("/"))
|
||||||
|
.respond_with(move |req: &Request| {
|
||||||
|
let text: String = req.body_json().unwrap();
|
||||||
|
match text_to_embedding.get(text.as_str()) {
|
||||||
|
Some(embedding) => {
|
||||||
|
ResponseTemplate::new(200).set_body_json(json!({ "data": embedding }))
|
||||||
|
}
|
||||||
|
None => ResponseTemplate::new(404)
|
||||||
|
.set_body_json(json!({"error": "text not found", "text": text})),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.mount(&mock_server)
|
||||||
|
.await;
|
||||||
|
let url = mock_server.uri();
|
||||||
|
|
||||||
|
let embedder_settings = json!({
|
||||||
|
"source": "rest",
|
||||||
|
"url": url,
|
||||||
|
"dimensions": 3,
|
||||||
|
"request": "{{text}}",
|
||||||
|
"response": {
|
||||||
|
"data": "{{embedding}}"
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
(mock_server, embedder_settings)
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
|
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
|
||||||
struct MultipleRequest {
|
struct MultipleRequest {
|
||||||
input: Vec<String>,
|
input: Vec<String>,
|
||||||
@ -1100,6 +1141,7 @@ async fn server_returns_bad_request() {
|
|||||||
|
|
||||||
let (response, code) = index
|
let (response, code) = index
|
||||||
.update_settings(json!({
|
.update_settings(json!({
|
||||||
|
"searchableAttributes": ["name", "missing_field"],
|
||||||
"embedders": {
|
"embedders": {
|
||||||
"rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "{{embedding}}", "dimensions": 3 }),
|
"rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "{{embedding}}", "dimensions": 3 }),
|
||||||
},
|
},
|
||||||
@ -1115,6 +1157,10 @@ async fn server_returns_bad_request() {
|
|||||||
"type": "settingsUpdate",
|
"type": "settingsUpdate",
|
||||||
"canceledBy": null,
|
"canceledBy": null,
|
||||||
"details": {
|
"details": {
|
||||||
|
"searchableAttributes": [
|
||||||
|
"name",
|
||||||
|
"missing_field"
|
||||||
|
],
|
||||||
"embedders": {
|
"embedders": {
|
||||||
"rest": {
|
"rest": {
|
||||||
"source": "rest",
|
"source": "rest",
|
||||||
@ -1148,7 +1194,7 @@ async fn server_returns_bad_request() {
|
|||||||
"indexedDocuments": 0
|
"indexedDocuments": 0
|
||||||
},
|
},
|
||||||
"error": {
|
"error": {
|
||||||
"message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\" id: 1\\\\n name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 24\"}`",
|
"message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`",
|
||||||
"code": "vector_embedding_error",
|
"code": "vector_embedding_error",
|
||||||
"type": "invalid_request",
|
"type": "invalid_request",
|
||||||
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
|
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
|
||||||
@ -1891,3 +1937,109 @@ async fn server_custom_header() {
|
|||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn searchable_reindex() {
|
||||||
|
let (_mock, setting) = create_mock_map().await;
|
||||||
|
let server = get_server_vector().await;
|
||||||
|
let index = server.index("doggo");
|
||||||
|
|
||||||
|
let (response, code) = index
|
||||||
|
.update_settings(json!({
|
||||||
|
"searchableAttributes": ["name", "missing_field"],
|
||||||
|
"embedders": {
|
||||||
|
"rest": setting,
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
snapshot!(code, @"202 Accepted");
|
||||||
|
let task = server.wait_task(response.uid()).await;
|
||||||
|
snapshot!(task, @r###"
|
||||||
|
{
|
||||||
|
"uid": "[uid]",
|
||||||
|
"indexUid": "doggo",
|
||||||
|
"status": "succeeded",
|
||||||
|
"type": "settingsUpdate",
|
||||||
|
"canceledBy": null,
|
||||||
|
"details": {
|
||||||
|
"searchableAttributes": [
|
||||||
|
"name",
|
||||||
|
"missing_field"
|
||||||
|
],
|
||||||
|
"embedders": {
|
||||||
|
"rest": {
|
||||||
|
"source": "rest",
|
||||||
|
"dimensions": 3,
|
||||||
|
"url": "[url]",
|
||||||
|
"request": "{{text}}",
|
||||||
|
"response": {
|
||||||
|
"data": "{{embedding}}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"error": null,
|
||||||
|
"duration": "[duration]",
|
||||||
|
"enqueuedAt": "[date]",
|
||||||
|
"startedAt": "[date]",
|
||||||
|
"finishedAt": "[date]"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let (response, code) =
|
||||||
|
index.add_documents(json!( { "id": 1, "name": "kefir", "breed": "patou" }), None).await;
|
||||||
|
snapshot!(code, @"202 Accepted");
|
||||||
|
let task = server.wait_task(response.uid()).await;
|
||||||
|
snapshot!(task, @r###"
|
||||||
|
{
|
||||||
|
"uid": "[uid]",
|
||||||
|
"indexUid": "doggo",
|
||||||
|
"status": "succeeded",
|
||||||
|
"type": "documentAdditionOrUpdate",
|
||||||
|
"canceledBy": null,
|
||||||
|
"details": {
|
||||||
|
"receivedDocuments": 1,
|
||||||
|
"indexedDocuments": 1
|
||||||
|
},
|
||||||
|
"error": null,
|
||||||
|
"duration": "[duration]",
|
||||||
|
"enqueuedAt": "[date]",
|
||||||
|
"startedAt": "[date]",
|
||||||
|
"finishedAt": "[date]"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
|
||||||
|
// triggers reindexing with the new searchable attribute.
|
||||||
|
// as the mock intentionally doesn't know of this text, the task will fail, outputting the putative rendered text.
|
||||||
|
let (response, code) = index
|
||||||
|
.update_settings(json!({
|
||||||
|
"searchableAttributes": ["breed"],
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
snapshot!(code, @"202 Accepted");
|
||||||
|
let task = server.wait_task(response.uid()).await;
|
||||||
|
snapshot!(task, @r###"
|
||||||
|
{
|
||||||
|
"uid": "[uid]",
|
||||||
|
"indexUid": "doggo",
|
||||||
|
"status": "failed",
|
||||||
|
"type": "settingsUpdate",
|
||||||
|
"canceledBy": null,
|
||||||
|
"details": {
|
||||||
|
"searchableAttributes": [
|
||||||
|
"breed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"error": {
|
||||||
|
"message": "While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`",
|
||||||
|
"code": "vector_embedding_error",
|
||||||
|
"type": "invalid_request",
|
||||||
|
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
|
||||||
|
},
|
||||||
|
"duration": "[duration]",
|
||||||
|
"enqueuedAt": "[date]",
|
||||||
|
"startedAt": "[date]",
|
||||||
|
"finishedAt": "[date]"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
@ -5,7 +5,7 @@ use liquid::{ObjectView, ValueView};
|
|||||||
|
|
||||||
use super::document::Document;
|
use super::document::Document;
|
||||||
use super::fields::Fields;
|
use super::fields::Fields;
|
||||||
use crate::FieldsIdsMap;
|
use super::FieldsIdsMapWithMetadata;
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Context<'a> {
|
pub struct Context<'a> {
|
||||||
@ -14,7 +14,7 @@ pub struct Context<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Context<'a> {
|
impl<'a> Context<'a> {
|
||||||
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
|
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self {
|
||||||
Self { document, fields: Fields::new(document, field_id_map) }
|
Self { document, fields: Fields::new(document, field_id_map) }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,16 +4,20 @@ use liquid::model::{
|
|||||||
use liquid::{ObjectView, ValueView};
|
use liquid::{ObjectView, ValueView};
|
||||||
|
|
||||||
use super::document::Document;
|
use super::document::Document;
|
||||||
use crate::FieldsIdsMap;
|
use super::{FieldMetadata, FieldsIdsMapWithMetadata};
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Fields<'a>(Vec<FieldValue<'a>>);
|
pub struct Fields<'a>(Vec<FieldValue<'a>>);
|
||||||
|
|
||||||
impl<'a> Fields<'a> {
|
impl<'a> Fields<'a> {
|
||||||
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self {
|
pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self {
|
||||||
Self(
|
Self(
|
||||||
std::iter::repeat(document)
|
std::iter::repeat(document)
|
||||||
.zip(field_id_map.iter())
|
.zip(field_id_map.iter())
|
||||||
.map(|(document, (_fid, name))| FieldValue { document, name })
|
.map(|(document, (fid, name))| FieldValue {
|
||||||
|
document,
|
||||||
|
name,
|
||||||
|
metadata: field_id_map.metadata(fid).unwrap_or_default(),
|
||||||
|
})
|
||||||
.collect(),
|
.collect(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -23,6 +27,7 @@ impl<'a> Fields<'a> {
|
|||||||
pub struct FieldValue<'a> {
|
pub struct FieldValue<'a> {
|
||||||
name: &'a str,
|
name: &'a str,
|
||||||
document: &'a Document<'a>,
|
document: &'a Document<'a>,
|
||||||
|
metadata: FieldMetadata,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ValueView for FieldValue<'a> {
|
impl<'a> ValueView for FieldValue<'a> {
|
||||||
@ -74,6 +79,10 @@ impl<'a> FieldValue<'a> {
|
|||||||
self.document.get(self.name).unwrap_or(&LiquidValue::Nil)
|
self.document.get(self.name).unwrap_or(&LiquidValue::Nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn is_searchable(&self) -> &bool {
|
||||||
|
&self.metadata.searchable
|
||||||
|
}
|
||||||
|
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn is_empty(&self) -> bool {
|
||||||
self.size() == 0
|
self.size() == 0
|
||||||
}
|
}
|
||||||
@ -89,12 +98,14 @@ impl<'a> ObjectView for FieldValue<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
|
||||||
Box::new(["name", "value"].iter().map(|&x| KStringCow::from_static(x)))
|
Box::new(["name", "value", "is_searchable"].iter().map(|&x| KStringCow::from_static(x)))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
|
||||||
Box::new(
|
Box::new(
|
||||||
std::iter::once(self.name() as &dyn ValueView).chain(std::iter::once(self.value())),
|
std::iter::once(self.name() as &dyn ValueView)
|
||||||
|
.chain(std::iter::once(self.value()))
|
||||||
|
.chain(std::iter::once(self.is_searchable() as &dyn ValueView)),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,13 +114,14 @@ impl<'a> ObjectView for FieldValue<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn contains_key(&self, index: &str) -> bool {
|
fn contains_key(&self, index: &str) -> bool {
|
||||||
index == "name" || index == "value"
|
index == "name" || index == "value" || index == "is_searchable"
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
|
||||||
match index {
|
match index {
|
||||||
"name" => Some(self.name()),
|
"name" => Some(self.name()),
|
||||||
"value" => Some(self.value()),
|
"value" => Some(self.value()),
|
||||||
|
"is_searchable" => Some(self.is_searchable()),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4,14 +4,16 @@ pub(crate) mod error;
|
|||||||
mod fields;
|
mod fields;
|
||||||
mod template_checker;
|
mod template_checker;
|
||||||
|
|
||||||
|
use std::collections::BTreeMap;
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
|
use std::ops::Deref;
|
||||||
|
|
||||||
use error::{NewPromptError, RenderPromptError};
|
use error::{NewPromptError, RenderPromptError};
|
||||||
|
|
||||||
use self::context::Context;
|
use self::context::Context;
|
||||||
use self::document::Document;
|
use self::document::Document;
|
||||||
use crate::update::del_add::DelAdd;
|
use crate::update::del_add::DelAdd;
|
||||||
use crate::FieldsIdsMap;
|
use crate::{FieldId, FieldsIdsMap};
|
||||||
|
|
||||||
pub struct Prompt {
|
pub struct Prompt {
|
||||||
template: liquid::Template,
|
template: liquid::Template,
|
||||||
@ -53,8 +55,10 @@ fn default_template() -> liquid::Template {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn default_template_text() -> &'static str {
|
fn default_template_text() -> &'static str {
|
||||||
"{% for field in fields %} \
|
"{% for field in fields %}\
|
||||||
|
{% if field.is_searchable and field.value != nil %}\
|
||||||
{{ field.name }}: {{ field.value }}\n\
|
{{ field.name }}: {{ field.value }}\n\
|
||||||
|
{% endif %}\
|
||||||
{% endfor %}"
|
{% endfor %}"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,7 +97,7 @@ impl Prompt {
|
|||||||
&self,
|
&self,
|
||||||
document: obkv::KvReaderU16<'_>,
|
document: obkv::KvReaderU16<'_>,
|
||||||
side: DelAdd,
|
side: DelAdd,
|
||||||
field_id_map: &FieldsIdsMap,
|
field_id_map: &FieldsIdsMapWithMetadata,
|
||||||
) -> Result<String, RenderPromptError> {
|
) -> Result<String, RenderPromptError> {
|
||||||
let document = Document::new(document, side, field_id_map);
|
let document = Document::new(document, side, field_id_map);
|
||||||
let context = Context::new(&document, field_id_map);
|
let context = Context::new(&document, field_id_map);
|
||||||
@ -102,6 +106,40 @@ impl Prompt {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct FieldsIdsMapWithMetadata<'a> {
|
||||||
|
fields_ids_map: &'a FieldsIdsMap,
|
||||||
|
metadata: BTreeMap<FieldId, FieldMetadata>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> FieldsIdsMapWithMetadata<'a> {
|
||||||
|
pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self {
|
||||||
|
let mut metadata: BTreeMap<FieldId, FieldMetadata> =
|
||||||
|
fields_ids_map.ids().map(|id| (id, Default::default())).collect();
|
||||||
|
for searchable_field_id in searchable_fields_ids {
|
||||||
|
let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue };
|
||||||
|
metadata.searchable = true;
|
||||||
|
}
|
||||||
|
Self { fields_ids_map, metadata }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn metadata(&self, field_id: FieldId) -> Option<FieldMetadata> {
|
||||||
|
self.metadata.get(&field_id).copied()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Deref for FieldsIdsMapWithMetadata<'a> {
|
||||||
|
type Target = FieldsIdsMap;
|
||||||
|
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
self.fields_ids_map
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default, Clone, Copy)]
|
||||||
|
pub struct FieldMetadata {
|
||||||
|
pub searchable: bool,
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::Prompt;
|
use super::Prompt;
|
||||||
|
@ -15,14 +15,14 @@ use serde_json::Value;
|
|||||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||||
use crate::error::FaultSource;
|
use crate::error::FaultSource;
|
||||||
use crate::index::IndexEmbeddingConfig;
|
use crate::index::IndexEmbeddingConfig;
|
||||||
use crate::prompt::Prompt;
|
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
|
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
|
||||||
use crate::vector::settings::{EmbedderAction, ReindexAction};
|
use crate::vector::settings::{EmbedderAction, ReindexAction};
|
||||||
use crate::vector::{Embedder, Embeddings};
|
use crate::vector::{Embedder, Embeddings};
|
||||||
use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort};
|
use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};
|
||||||
|
|
||||||
/// The length of the elements that are always in the buffer when inserting new values.
|
/// The length of the elements that are always in the buffer when inserting new values.
|
||||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||||
@ -189,7 +189,13 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
let reindex_vectors = settings_diff.reindex_vectors();
|
let reindex_vectors = settings_diff.reindex_vectors();
|
||||||
|
|
||||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||||
|
let old_fields_ids_map =
|
||||||
|
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
|
||||||
|
|
||||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||||
|
let new_fields_ids_map =
|
||||||
|
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
|
||||||
|
|
||||||
// the vector field id may have changed
|
// the vector field id may have changed
|
||||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||||
|
|
||||||
@ -376,7 +382,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
);
|
);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
regenerate_prompt(obkv, prompt, new_fields_ids_map)?
|
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
// prompt regeneration is only triggered for existing embedders
|
// prompt regeneration is only triggered for existing embedders
|
||||||
@ -393,7 +399,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
regenerate_if_prompt_changed(
|
regenerate_if_prompt_changed(
|
||||||
obkv,
|
obkv,
|
||||||
(old_prompt, prompt),
|
(old_prompt, prompt),
|
||||||
(old_fields_ids_map, new_fields_ids_map),
|
(&old_fields_ids_map, &new_fields_ids_map),
|
||||||
)?
|
)?
|
||||||
} else {
|
} else {
|
||||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
// we can simply ignore user provided vectors as they are not regenerated and are
|
||||||
@ -409,7 +415,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
prompt,
|
prompt,
|
||||||
(add_to_user_provided, remove_from_user_provided),
|
(add_to_user_provided, remove_from_user_provided),
|
||||||
(old, new),
|
(old, new),
|
||||||
(old_fields_ids_map, new_fields_ids_map),
|
(&old_fields_ids_map, &new_fields_ids_map),
|
||||||
document_id,
|
document_id,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
embedder_is_manual,
|
embedder_is_manual,
|
||||||
@ -479,7 +485,10 @@ fn extract_vector_document_diff(
|
|||||||
prompt: &Prompt,
|
prompt: &Prompt,
|
||||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||||
(old, new): (VectorState, VectorState),
|
(old, new): (VectorState, VectorState),
|
||||||
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
(old_fields_ids_map, new_fields_ids_map): (
|
||||||
|
&FieldsIdsMapWithMetadata,
|
||||||
|
&FieldsIdsMapWithMetadata,
|
||||||
|
),
|
||||||
document_id: impl Fn() -> Value,
|
document_id: impl Fn() -> Value,
|
||||||
embedder_name: &str,
|
embedder_name: &str,
|
||||||
embedder_is_manual: bool,
|
embedder_is_manual: bool,
|
||||||
@ -599,7 +608,10 @@ fn extract_vector_document_diff(
|
|||||||
fn regenerate_if_prompt_changed(
|
fn regenerate_if_prompt_changed(
|
||||||
obkv: obkv::KvReader<'_, FieldId>,
|
obkv: obkv::KvReader<'_, FieldId>,
|
||||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||||
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
|
(old_fields_ids_map, new_fields_ids_map): (
|
||||||
|
&FieldsIdsMapWithMetadata,
|
||||||
|
&FieldsIdsMapWithMetadata,
|
||||||
|
),
|
||||||
) -> Result<VectorStateDelta> {
|
) -> Result<VectorStateDelta> {
|
||||||
let old_prompt =
|
let old_prompt =
|
||||||
old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default());
|
old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default());
|
||||||
@ -614,7 +626,7 @@ fn regenerate_if_prompt_changed(
|
|||||||
fn regenerate_prompt(
|
fn regenerate_prompt(
|
||||||
obkv: obkv::KvReader<'_, FieldId>,
|
obkv: obkv::KvReader<'_, FieldId>,
|
||||||
prompt: &Prompt,
|
prompt: &Prompt,
|
||||||
new_fields_ids_map: &FieldsIdsMap,
|
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||||
) -> Result<VectorStateDelta> {
|
) -> Result<VectorStateDelta> {
|
||||||
let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||||
|
|
||||||
|
@ -1238,7 +1238,7 @@ impl InnerIndexSettingsDiff {
|
|||||||
old_settings: InnerIndexSettings,
|
old_settings: InnerIndexSettings,
|
||||||
new_settings: InnerIndexSettings,
|
new_settings: InnerIndexSettings,
|
||||||
primary_key_id: Option<FieldId>,
|
primary_key_id: Option<FieldId>,
|
||||||
embedding_config_updates: BTreeMap<String, EmbedderAction>,
|
mut embedding_config_updates: BTreeMap<String, EmbedderAction>,
|
||||||
settings_update_only: bool,
|
settings_update_only: bool,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let only_additional_fields = match (
|
let only_additional_fields = match (
|
||||||
@ -1273,6 +1273,32 @@ impl InnerIndexSettingsDiff {
|
|||||||
let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
|
let cache_user_defined_searchables = old_settings.user_defined_searchable_fields
|
||||||
!= new_settings.user_defined_searchable_fields;
|
!= new_settings.user_defined_searchable_fields;
|
||||||
|
|
||||||
|
// if the user-defined searchables changed, then we need to reindex prompts.
|
||||||
|
if cache_user_defined_searchables {
|
||||||
|
for (embedder_name, (config, _)) in new_settings.embedding_configs.inner_as_ref() {
|
||||||
|
// skip embedders that don't use document templates
|
||||||
|
if !config.uses_document_template() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// note: this could currently be entry.or_insert(..), but we're future-proofing with an explicit match
|
||||||
|
// this always makes the code clearer by explicitly handling the cases
|
||||||
|
match embedding_config_updates.entry(embedder_name.clone()) {
|
||||||
|
std::collections::btree_map::Entry::Vacant(entry) => {
|
||||||
|
entry.insert(EmbedderAction::Reindex(ReindexAction::RegeneratePrompts));
|
||||||
|
}
|
||||||
|
std::collections::btree_map::Entry::Occupied(entry) => match entry.get() {
|
||||||
|
EmbedderAction::WriteBackToDocuments(_) => { /* we are deleting this embedder, so no point in regeneration */
|
||||||
|
}
|
||||||
|
EmbedderAction::Reindex(ReindexAction::FullReindex) => { /* we are already fully reindexing */
|
||||||
|
}
|
||||||
|
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { /* we are already regenerating prompts */
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
InnerIndexSettingsDiff {
|
InnerIndexSettingsDiff {
|
||||||
old: old_settings,
|
old: old_settings,
|
||||||
new: new_settings,
|
new: new_settings,
|
||||||
|
@ -305,6 +305,16 @@ impl Embedder {
|
|||||||
Embedder::Rest(embedder) => embedder.distribution(),
|
Embedder::Rest(embedder) => embedder.distribution(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn uses_document_template(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
Embedder::HuggingFace(_)
|
||||||
|
| Embedder::OpenAi(_)
|
||||||
|
| Embedder::Ollama(_)
|
||||||
|
| Embedder::Rest(_) => true,
|
||||||
|
Embedder::UserProvided(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Describes the mean and sigma of distribution of embedding similarity in the embedding space.
|
/// Describes the mean and sigma of distribution of embedding similarity in the embedding space.
|
||||||
|
Loading…
Reference in New Issue
Block a user