diff --git a/index-scheduler/src/index_mapper/mod.rs b/index-scheduler/src/index_mapper/mod.rs index 14908120c..3cccb5a69 100644 --- a/index-scheduler/src/index_mapper/mod.rs +++ b/index-scheduler/src/index_mapper/mod.rs @@ -108,8 +108,10 @@ pub struct IndexStats { /// Association of every field name with the number of times it occurs in the documents. pub field_distribution: FieldDistribution, /// Creation date of the index. + #[serde(with = "time::serde::rfc3339")] pub created_at: OffsetDateTime, /// Date of the last update of the index. + #[serde(with = "time::serde::rfc3339")] pub updated_at: OffsetDateTime, } diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index e881734fb..2e70b4eb7 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -72,6 +72,19 @@ fn on_panic(info: &std::panic::PanicInfo) { #[actix_web::main] async fn main() -> anyhow::Result<()> { + try_main().await.inspect_err(|error| { + tracing::error!(%error); + let mut current = error.source(); + let mut depth = 0; + while let Some(source) = current { + tracing::info!(%source, depth, "Error caused by"); + current = source.source(); + depth += 1; + } + }) +} + +async fn try_main() -> anyhow::Result<()> { let (opt, config_read_from) = Opt::try_build()?; std::panic::set_hook(Box::new(on_panic)); diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 6f081f1c7..e95a75f69 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -682,6 +682,7 @@ generate_configure!( filterable_attributes, sortable_attributes, displayed_attributes, + localized_attributes, searchable_attributes, distinct_attribute, proximity_precision, diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs index dada9159b..915505be0 100644 --- a/meilisearch/src/search/mod.rs +++ b/meilisearch/src/search/mod.rs @@ -1369,12 +1369,18 @@ pub fn perform_facet_search( None => TimeBudget::default(), }; + // In the faceted search context, we want to use the intersection between the locales provided by the user + // and the locales of the facet string. + // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale. + // If the user does not provide locales, we use the locales of the facet string. let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - let locales = locales.or_else(|| { - localized_attributes + let localized_attributes_locales = + localized_attributes.into_iter().find(|attr| attr.match_str(&facet_name)); + let locales = localized_attributes_locales.map(|attr| { + attr.locales .into_iter() - .find(|attr| attr.match_str(&facet_name)) - .map(|attr| attr.locales) + .filter(|locale| locales.as_ref().map_or(true, |locales| locales.contains(locale))) + .collect() }); let (search, _, _, _) = diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index 9f1c22b75..dbc4fcc30 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -386,12 +386,39 @@ async fn force_locales() { |response, code| { snapshot!(response, @r###" { - "hits": [], + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], "query": "\"进击的巨人\"", "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 0 + "estimatedTotalHits": 1 } "###); snapshot!(code, @"200 OK"); @@ -483,12 +510,39 @@ async fn force_locales_with_pattern() { |response, code| { snapshot!(response, @r###" { - "hits": [], + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], "query": "\"进击的巨人\"", "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 0 + "estimatedTotalHits": 1 } "###); snapshot!(code, @"200 OK"); @@ -761,6 +815,275 @@ async fn force_different_locales_with_pattern() { .await; } +#[actix_rt::test] +async fn auto_infer_locales_at_search_with_attributes_to_search_on() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings( + json!({ + "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*_zh"], "locales": ["jpn"]}, + // force chinese + {"attributePatterns": ["*_ja"], "locales": ["cmn"]}, + // any language + {"attributePatterns": ["*_en"], "locales": []} + ] + }), + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // auto infer any language + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + // should infer chinese + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"], "attributesToSearchOn": ["name_zh", "description_zh"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + +#[actix_rt::test] +async fn auto_infer_locales_at_search() { + let server = Server::new().await; + + let index = server.index("test"); + let documents = DOCUMENTS.clone(); + let (response, _) = index + .update_settings( + json!({ + "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"], + "localizedAttributes": [ + // force japanese + {"attributePatterns": ["*"], "locales": ["jpn"]}, + ] + }), + ) + .await; + snapshot!(response, @r###" + { + "taskUid": 0, + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[date]" + } + "###); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; + + index + .search( + json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}), + |response, code| { + snapshot!(response, @r###" + { + "hits": [ + { + "name_zh": "进击的巨人", + "author_zh": "諫山創", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", + "id": 853, + "_vectors": { + "manual": [ + 1.0, + 2.0, + 3.0 + ] + }, + "_formatted": { + "name_zh": "巨人", + "author_zh": "諫山創", + "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "id": "853", + "_vectors": { + "manual": [ + "1.0", + "2.0", + "3.0" + ] + } + } + } + ], + "query": "\"进击的巨人\"", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + snapshot!(code, @"200 OK"); + }, + ) + .await; +} + #[actix_rt::test] async fn force_different_locales_with_pattern_nested() { let server = Server::new().await; diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 3c406cd5f..974025652 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -7,6 +7,7 @@ mod facet_search; mod formatted; mod geo; mod hybrid; +#[cfg(not(feature = "chinese-pinyin"))] mod locales; mod matching_strategy; mod multi; @@ -169,6 +170,7 @@ async fn negative_special_cases_search() { } #[cfg(feature = "default")] +#[cfg(not(feature = "chinese-pinyin"))] #[actix_rt::test] async fn test_kanji_language_detection() { let server = Server::new().await; diff --git a/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap b/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap index 903e96ffb..18532cba4 100644 --- a/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap +++ b/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap @@ -2,7 +2,7 @@ source: meilisearch/tests/search/errors.rs --- { - "uid": 0, + "uid": "[uid]", "indexUid": "tamo", "status": "succeeded", "type": "indexCreation", diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 58805d54f..1571b8ca6 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -9,6 +9,7 @@ static DEFAULT_SETTINGS_VALUES: Lazy> = Lazy::new(| let mut map = HashMap::new(); map.insert("displayed_attributes", json!(["*"])); map.insert("searchable_attributes", json!(["*"])); + map.insert("localized_attributes", json!(null)); map.insert("filterable_attributes", json!([])); map.insert("distinct_attribute", json!(null)); map.insert( @@ -409,6 +410,7 @@ macro_rules! test_setting_routes { test_setting_routes!( filterable_attributes put, displayed_attributes put, + localized_attributes put, searchable_attributes put, distinct_attribute put, stop_words put, diff --git a/meilisearch/tests/vector/intel_gen.txt.gz b/meilisearch/tests/vector/intel_gen.txt.gz new file mode 100644 index 000000000..115eafea5 Binary files /dev/null and b/meilisearch/tests/vector/intel_gen.txt.gz differ diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 66f1f87e7..7c9b375d9 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -1,3 +1,4 @@ +mod openai; mod rest; mod settings; @@ -10,6 +11,22 @@ use crate::common::index::Index; use crate::common::{default_settings, GetAllDocumentsOptions, Server}; use crate::json; +async fn get_server_vector() -> Server { + let server = Server::new().await; + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false, + "editDocumentsByFunction": false, + "containsFilter": false + } + "###); + server +} + #[actix_rt::test] async fn add_remove_user_provided() { let server = Server::new().await; diff --git a/meilisearch/tests/vector/openai.rs b/meilisearch/tests/vector/openai.rs new file mode 100644 index 000000000..f350abbe1 --- /dev/null +++ b/meilisearch/tests/vector/openai.rs @@ -0,0 +1,1873 @@ +use std::collections::BTreeMap; +use std::io::Write; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::OnceLock; + +use meili_snap::{json_string, snapshot}; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, Request, ResponseTemplate}; + +use crate::common::{GetAllDocumentsOptions, Value}; +use crate::json; +use crate::vector::get_server_vector; + +#[derive(serde::Deserialize)] +struct OpenAiResponses(BTreeMap); + +#[derive(serde::Deserialize)] +struct OpenAiResponse { + large: Option>, + small: Option>, + ada: Option>, + large_512: Option>, +} + +#[derive(serde::Deserialize)] +struct OpenAiTokenizedResponses { + tokens: Vec, + embedding: Vec, +} + +impl OpenAiResponses { + fn get(&self, text: &str, model_dimensions: ModelDimensions) -> Option<&[f32]> { + let entry = self.0.get(text)?; + match model_dimensions { + ModelDimensions::Large => entry.large.as_deref(), + ModelDimensions::Small => entry.small.as_deref(), + ModelDimensions::Ada => entry.ada.as_deref(), + ModelDimensions::Large512 => entry.large_512.as_deref(), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ModelDimensions { + Large, + Small, + Ada, + Large512, +} + +impl ModelDimensions { + fn add_to_settings(&self, settings: &mut Value) { + settings["model"] = serde_json::json!(self.model()); + if let ModelDimensions::Large512 = self { + settings["dimensions"] = serde_json::json!(512); + } + } + + fn model(&self) -> &'static str { + match self { + ModelDimensions::Large | ModelDimensions::Large512 => "text-embedding-3-large", + ModelDimensions::Small => "text-embedding-3-small", + ModelDimensions::Ada => "text-embedding-ada-002", + } + } + + fn from_request(request: &serde_json::Value) -> Self { + let has_dimensions_512 = if let Some(dimensions) = request.get("dimensions") { + if dimensions != 512 { + panic!("unsupported dimensions values") + } + true + } else { + false + }; + let serde_json::Value::String(model) = &request["model"] else { + panic!("unsupported non string model") + }; + match (model.as_str(), has_dimensions_512) { + ("text-embedding-3-large", true) => Self::Large512, + (_, true) => panic!("unsupported dimensions with non-large model"), + ("text-embedding-3-large", false) => Self::Large, + ("text-embedding-3-small", false) => Self::Small, + ("text-embedding-ada-002", false) => Self::Ada, + (_, false) => panic!("unsupported model"), + } + } +} + +fn openai_responses() -> &'static OpenAiResponses { + static OPENAI_RESPONSES: OnceLock = OnceLock::new(); + OPENAI_RESPONSES.get_or_init(|| { + // json file that was compressed with gzip + // decompress with `gzip --keep -d openai_responses.json.gz` + // recompress with `gzip --keep -c openai_responses.json > openai_responses.json.gz` + let compressed_responses = include_bytes!("openai_responses.json.gz"); + let mut responses = Vec::new(); + let mut decoder = flate2::write::GzDecoder::new(&mut responses); + + decoder.write_all(compressed_responses).unwrap(); + drop(decoder); + serde_json::from_slice(&responses).unwrap() + }) +} + +fn openai_tokenized_responses() -> &'static OpenAiTokenizedResponses { + static OPENAI_TOKENIZED_RESPONSES: OnceLock = OnceLock::new(); + OPENAI_TOKENIZED_RESPONSES.get_or_init(|| { + // json file that was compressed with gzip + // decompress with `gzip --keep -d openai_tokenized_responses.json.gz` + // recompress with `gzip --keep -c openai_tokenized_responses.json > openai_tokenized_responses.json.gz` + let compressed_responses = include_bytes!("openai_tokenized_responses.json.gz"); + let mut responses = Vec::new(); + let mut decoder = flate2::write::GzDecoder::new(&mut responses); + + decoder.write_all(compressed_responses).unwrap(); + drop(decoder); + serde_json::from_slice(&responses).unwrap() + }) +} + +fn long_text() -> &'static str { + static LONG_TEXT: OnceLock = OnceLock::new(); + LONG_TEXT.get_or_init(|| { + // decompress with `gzip --keep -d intel_gen.txt.gz` + // recompress with `gzip --keep -c intel_gen.txt > intel_gen.txt.gz` + let compressed_long_text = include_bytes!("intel_gen.txt.gz"); + let mut long_text = Vec::new(); + let mut decoder = flate2::write::GzDecoder::new(&mut long_text); + + decoder.write_all(compressed_long_text).unwrap(); + drop(decoder); + let long_text = std::str::from_utf8(&long_text).unwrap(); + + long_text.repeat(3) + }) +} + +async fn create_mock_tokenized() -> (MockServer, Value) { + create_mock_with_template("{{doc.text}}", ModelDimensions::Large, false).await +} + +async fn create_mock_with_template( + document_template: &str, + model_dimensions: ModelDimensions, + fallible: bool, +) -> (MockServer, Value) { + let mock_server = MockServer::start().await; + const API_KEY: &str = "my-api-key"; + const API_KEY_BEARER: &str = "Bearer my-api-key"; + + let attempt = AtomicU32::new(0); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + // 0. maybe return 500 + if fallible { + let attempt = attempt.fetch_add(1, Ordering::Relaxed); + let failed = matches!(attempt % 4, 0 | 1 | 3); + if failed { + return ResponseTemplate::new(503).set_body_json(json!({ + "error": { + "message": "come back later", + "type": "come_back_later" + } + })) + } + } + // 1. check API key + match req.headers.get("Authorization") { + Some(api_key) if api_key == API_KEY_BEARER => { + {} + } + Some(api_key) => { + let api_key = api_key.to_str().unwrap(); + return ResponseTemplate::new(401).set_body_json( + json!( + { + "error": { + "message": format!("Incorrect API key provided: {api_key}. You can find your API key at https://platform.openai.com/account/api-keys."), + "type": "invalid_request_error", + "param": serde_json::Value::Null, + "code": "invalid_api_key" + } + } + ), + ) + } + None => { + return ResponseTemplate::new(401).set_body_json( + json!( + { + "error": { + "message": "You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.", + "type": "invalid_request_error", + "param": serde_json::Value::Null, + "code": serde_json::Value::Null + } + } + ), + ) + } + } + // 2. parse text inputs + let query: serde_json::Value = match req.body_json() { + Ok(query) => query, + Err(_error) => return ResponseTemplate::new(400).set_body_json( + json!( + { + "error": { + "message": "We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)", + "type": "invalid_request_error", + "param": serde_json::Value::Null, + "code": serde_json::Value::Null + } + } + ) + ) + }; + let query_model_dimensions = ModelDimensions::from_request(&query); + if query_model_dimensions != model_dimensions { + panic!("Expected {model_dimensions:?}, got {query_model_dimensions:?}") + } + + // 3. for each text, find embedding in responses + let serde_json::Value::Array(inputs) = &query["input"] else { + panic!("Unexpected `input` value") + }; + + let openai_tokenized_responses = openai_tokenized_responses(); + let embeddings = if inputs == openai_tokenized_responses.tokens.as_slice() { + vec![openai_tokenized_responses.embedding.clone()] + } else { + let mut embeddings = Vec::new(); + for input in inputs { + let serde_json::Value::String(input) = input else { + return ResponseTemplate::new(400).set_body_json(json!({ + "error": { + "message": "Unexpected `input` value", + "type": "test_response", + "query": query + } + })) + }; + + if input == long_text() { + return ResponseTemplate::new(400).set_body_json(json!( + { + "error": { + "message": "This model's maximum context length is 8192 tokens, however you requested 10554 tokens (10554 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", + "type": "invalid_request_error", + "param": null, + "code": null, + } + } + )); + } + + let Some(embedding) = openai_responses().get(input, model_dimensions) else { + return ResponseTemplate::new(404).set_body_json(json!( + { + "error": { + "message": "Could not find embedding for text", + "text": input, + "model_dimensions": format!("{model_dimensions:?}"), + "type": "add_to_openai_responses_json_please", + "query": query, + } + } + )) + }; + + embeddings.push(embedding.to_vec()); + } + embeddings + }; + + + let data : Vec<_> = embeddings.into_iter().enumerate().map(|(index, embedding)| json!({ + "object": "embedding", + "index": index, + "embedding": embedding, + })).collect(); + + // 4. produce output from embeddings + ResponseTemplate::new(200).set_body_json(json!({ + "object": "list", + "data": data, + "model": model_dimensions.model(), + "usage": { + "prompt_tokens": "[prompt_tokens]", + "total_tokens": "[total_tokens]" + } + })) + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let mut embedder_settings = json!({ + "source": "openAi", + "url": url, + "apiKey": API_KEY, + "documentTemplate": document_template + }); + + model_dimensions.add_to_settings(&mut embedder_settings); + + (mock_server, embedder_settings) +} + +const DOGGO_TEMPLATE: &str = r#"{%- if doc.gender == "F" -%}Une chienne nommée {{doc.name}}, née en {{doc.birthyear}} + {%- else -%} + Un chien nommé {{doc.name}}, né en {{doc.birthyear}} + {%- endif %}, de race {{doc.breed}}."#; + +async fn create_mock() -> (MockServer, Value) { + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, false).await +} + +async fn create_mock_dimensions() -> (MockServer, Value) { + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large512, false).await +} + +async fn create_mock_small_embedding_model() -> (MockServer, Value) { + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Small, false).await +} + +async fn create_mock_legacy_embedding_model() -> (MockServer, Value) { + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Ada, false).await +} + +async fn create_fallible_mock() -> (MockServer, Value) { + create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true).await +} + +// basic test "it works" +#[actix_rt::test] +async fn it_works() { + let (_mock, setting) = create_mock().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents, {".results.*._vectors.default.embeddings" => "[vector]"}), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); +} + +// tokenize long text + +// basic test "it works" +#[actix_rt::test] +async fn tokenize_long_text() { + let (_mock, setting) = create_mock_tokenized().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "text": long_text()} + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "showRankingScore": true, + "attributesToRetrieve": ["id"], + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "_rankingScore": 0.07944583892822266 + } + ] + "###); +} + +// "wrong parameters" + +#[actix_rt::test] +async fn bad_api_key() { + let (_mock, mut setting) = create_mock().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // wrong API key + setting["apiKey"] = "doggo".into(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "default": { + "source": "openAi", + "model": "text-embedding-3-large", + "apiKey": "XXX...", + "documentTemplate": "{%- if doc.gender == \"F\" -%}Une chienne nommée {{doc.name}}, née en {{doc.birthyear}}\n {%- else -%}\n Un chien nommé {{doc.name}}, né en {{doc.birthyear}}\n {%- endif %}, de race {{doc.breed}}.", + "url": "[url]" + } + } + }, + "error": { + "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // no API key + setting.as_object_mut().unwrap().remove("apiKey"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "default": { + "source": "openAi", + "model": "text-embedding-3-large", + "documentTemplate": "{%- if doc.gender == \"F\" -%}Une chienne nommée {{doc.name}}, née en {{doc.birthyear}}\n {%- else -%}\n Un chien nommé {{doc.name}}, né en {{doc.birthyear}}\n {%- endif %}, de race {{doc.breed}}.", + "url": "[url]" + } + } + }, + "error": { + "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // not a string API key + setting["apiKey"] = 42.into(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Invalid value type at `.embedders.default.apiKey`: expected a string, but found a positive integer: `42`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); +} + +// one test with wrong model +#[actix_rt::test] +async fn bad_model() { + let (_mock, mut setting) = create_mock().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // wrong model + setting["model"] = "doggo".into(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + + snapshot!(response, @r###" + { + "message": "`.embedders.default.model`: Invalid model `doggo` for OpenAI. Supported models: [\"text-embedding-ada-002\", \"text-embedding-3-small\", \"text-embedding-3-large\"]", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + // not a string model + setting["model"] = 42.into(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Invalid value type at `.embedders.default.model`: expected a string, but found a positive integer: `42`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); +} + +#[actix_rt::test] +async fn bad_dimensions() { + let (_mock, mut setting) = create_mock().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // null dimensions + setting["dimensions"] = 0.into(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + + snapshot!(response, @r###" + { + "message": "`.embedders.default.dimensions`: `dimensions` cannot be zero", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + // negative dimensions + setting["dimensions"] = (-42).into(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Invalid value type at `.embedders.default.dimensions`: expected a positive integer, but found a negative integer: `-42`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + // huge dimensions + setting["dimensions"] = (42_000_000).into(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.default.dimensions`: Model `text-embedding-3-large` does not support overriding its dimensions to a value higher than 3072. Found 42000000", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); +} + +// one test with changed dimensions +#[actix_rt::test] +async fn smaller_dimensions() { + let (_mock, setting) = create_mock_dimensions().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents, {".results.*._vectors.default.embeddings" => "[vector]"}), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); +} + +// one test with different models +#[actix_rt::test] +async fn small_embedding_model() { + let (_mock, setting) = create_mock_small_embedding_model().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents, {".results.*._vectors.default.embeddings" => "[vector]"}), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); +} + +#[actix_rt::test] +async fn legacy_embedding_model() { + let (_mock, setting) = create_mock_legacy_embedding_model().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents, {".results.*._vectors.default.embeddings" => "[vector]"}), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + } + ] + "###); +} + +// test with a server that responds 500 on 3 out of 4 calls +#[actix_rt::test] +async fn it_still_works() { + let (_mock, setting) = create_fallible_mock().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "default": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "Vénus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents, {".results.*._vectors.default.embeddings" => "[vector]"}), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever", + "_vectors": { + "default": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "Vénus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); +} +// test with a server that wrongly responds 400 diff --git a/meilisearch/tests/vector/openai_responses.json.gz b/meilisearch/tests/vector/openai_responses.json.gz new file mode 100644 index 000000000..2d27822fe Binary files /dev/null and b/meilisearch/tests/vector/openai_responses.json.gz differ diff --git a/meilisearch/tests/vector/openai_tokenized_responses.json.gz b/meilisearch/tests/vector/openai_tokenized_responses.json.gz new file mode 100644 index 000000000..0c708448c Binary files /dev/null and b/meilisearch/tests/vector/openai_tokenized_responses.json.gz differ diff --git a/meilisearch/tests/vector/rest.rs b/meilisearch/tests/vector/rest.rs index 317ca8676..1a64eeb78 100644 --- a/meilisearch/tests/vector/rest.rs +++ b/meilisearch/tests/vector/rest.rs @@ -5,9 +5,9 @@ use reqwest::IntoUrl; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, Request, ResponseTemplate}; -use crate::common::{Server, Value}; +use crate::common::Value; use crate::json; -use crate::vector::GetAllDocumentsOptions; +use crate::vector::{get_server_vector, GetAllDocumentsOptions}; async fn create_mock() -> (MockServer, Value) { let mock_server = MockServer::start().await; @@ -265,22 +265,6 @@ async fn dummy_testing_the_mock() { snapshot!(body, @r###"{"data":[4,4,4]}"###); } -async fn get_server_vector() -> Server { - let server = Server::new().await; - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); - server -} - #[actix_rt::test] async fn bad_request() { let (mock, _setting) = create_mock().await; @@ -1816,7 +1800,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`", + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1858,7 +1842,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`", + "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/milli/src/index.rs b/milli/src/index.rs index 3a2f3169c..512e911aa 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -9,7 +9,6 @@ use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; @@ -173,8 +172,8 @@ impl Index { pub fn new_with_creation_dates>( mut options: heed::EnvOpenOptions, path: P, - created_at: OffsetDateTime, - updated_at: OffsetDateTime, + created_at: time::OffsetDateTime, + updated_at: time::OffsetDateTime, ) -> Result { use db_name::*; @@ -256,22 +255,22 @@ impl Index { } pub fn new>(options: heed::EnvOpenOptions, path: P) -> Result { - let now = OffsetDateTime::now_utc(); + let now = time::OffsetDateTime::now_utc(); Self::new_with_creation_dates(options, path, now, now) } fn set_creation_dates( env: &heed::Env, main: Database, - created_at: OffsetDateTime, - updated_at: OffsetDateTime, + created_at: time::OffsetDateTime, + updated_at: time::OffsetDateTime, ) -> heed::Result<()> { let mut txn = env.write_txn()?; // The db was just created, we update its metadata with the relevant information. let main = main.remap_types::>(); if main.get(&txn, main_key::CREATED_AT_KEY)?.is_none() { - main.put(&mut txn, main_key::UPDATED_AT_KEY, &updated_at)?; - main.put(&mut txn, main_key::CREATED_AT_KEY, &created_at)?; + main.put(&mut txn, main_key::UPDATED_AT_KEY, &OffsetDateTime(updated_at))?; + main.put(&mut txn, main_key::CREATED_AT_KEY, &OffsetDateTime(created_at))?; txn.commit()?; } Ok(()) @@ -371,7 +370,7 @@ impl Index { wtxn: &mut RwTxn<'_>, primary_key: &str, ) -> heed::Result<()> { - self.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + self.set_updated_at(wtxn, &time::OffsetDateTime::now_utc())?; self.main.remap_types::().put(wtxn, main_key::PRIMARY_KEY_KEY, primary_key) } @@ -1323,7 +1322,7 @@ impl Index { } /// Returns the index creation time. - pub fn created_at(&self, rtxn: &RoTxn<'_>) -> Result { + pub fn created_at(&self, rtxn: &RoTxn<'_>) -> Result { Ok(self .main .remap_types::>() @@ -1331,11 +1330,12 @@ impl Index { .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::CREATED_AT_KEY), - })?) + })? + .0) } /// Returns the index last updated time. - pub fn updated_at(&self, rtxn: &RoTxn<'_>) -> Result { + pub fn updated_at(&self, rtxn: &RoTxn<'_>) -> Result { Ok(self .main .remap_types::>() @@ -1343,18 +1343,19 @@ impl Index { .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::UPDATED_AT_KEY), - })?) + })? + .0) } pub(crate) fn set_updated_at( &self, wtxn: &mut RwTxn<'_>, - time: &OffsetDateTime, + time: &time::OffsetDateTime, ) -> heed::Result<()> { self.main.remap_types::>().put( wtxn, main_key::UPDATED_AT_KEY, - time, + &OffsetDateTime(*time), ) } @@ -1681,6 +1682,10 @@ pub struct IndexEmbeddingConfig { pub user_provided: RoaringBitmap, } +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime); + #[cfg(test)] pub(crate) mod tests { use std::collections::HashSet; diff --git a/milli/src/localized_attributes_rules.rs b/milli/src/localized_attributes_rules.rs index 739d03043..3c421ca6b 100644 --- a/milli/src/localized_attributes_rules.rs +++ b/milli/src/localized_attributes_rules.rs @@ -90,6 +90,21 @@ impl LocalizedFieldIds { pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> { self.field_id_to_locales.get(&fields_id).map(Vec::as_slice) } + + pub fn all_locales(&self) -> Vec { + let mut locales = Vec::new(); + for field_locales in self.field_id_to_locales.values() { + if !field_locales.is_empty() { + locales.extend(field_locales); + } else { + // If a field has no locales, we consider it as not localized + return Vec::new(); + } + } + locales.sort(); + locales.dedup(); + locales + } } #[cfg(test)] diff --git a/milli/src/search/facet/search.rs b/milli/src/search/facet/search.rs index 39fb7374a..cdba7ee16 100644 --- a/milli/src/search/facet/search.rs +++ b/milli/src/search/facet/search.rs @@ -339,10 +339,18 @@ impl ValuesCollection { fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { let options = NormalizerOption { lossy: true, ..Default::default() }; let mut detection = StrDetection::new(facet_string, locales); + + // Detect the language of the facet string only if several locales are explicitly provided. + let language = match locales { + Some(&[language]) => Some(language), + Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(), + _ => None, + }; + let token = Token { lemma: std::borrow::Cow::Borrowed(facet_string), script: detection.script(), - language: detection.language(), + language, ..Default::default() }; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 0f5eb23e1..3057066d2 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -360,6 +360,7 @@ mod test { use super::*; #[cfg(feature = "japanese")] + #[cfg(not(feature = "chinese-pinyin"))] #[test] fn test_kanji_language_detection() { use crate::index::tests::TempIndex; diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index d33058af1..d1d9d6d9a 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -110,18 +110,18 @@ impl<'ctx> DatabaseCache<'ctx> { .map_err(Into::into) } - fn get_value_from_keys<'v, K1, KC, DC>( + fn get_value_from_keys<'v, K1, KC>( txn: &'ctx RoTxn<'_>, cache_key: K1, db_keys: &'v [KC::EItem], cache: &mut FxHashMap>>, db: Database, + universe: Option<&RoaringBitmap>, merger: MergeFn, - ) -> Result> + ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, - DC: BytesDecodeOwned, KC::EItem: Sized, { if let Entry::Vacant(entry) = cache.entry(cache_key) { @@ -146,16 +146,22 @@ impl<'ctx> DatabaseCache<'ctx> { entry.insert(bitmap_ptr); } - match cache.get(&cache_key).unwrap() { - Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes) + let bitmap_bytes = match cache.get(&cache_key).unwrap() { + Some(Cow::Borrowed(bytes)) => bytes, + Some(Cow::Owned(bytes)) => bytes.as_slice(), + None => return Ok(None), + }; + + match (bitmap_bytes, universe) { + (bytes, Some(universe)) => { + CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe) + .map(Some) + .map_err(Into::into) + } + (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes) .map(Some) .map_err(heed::Error::Decoding) .map_err(Into::into), - Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes) - .map(Some) - .map_err(heed::Error::Decoding) - .map_err(Into::into), - None => Ok(None), } } } @@ -207,12 +213,13 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value_from_keys::<_, _>( self.txn, word, &keys[..], &mut self.db_cache.word_docids, self.index.word_fid_docids.remap_data_type::(), + universe, merge_cbo_roaring_bitmaps, ) } @@ -238,12 +245,13 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value_from_keys::<_, _>( self.txn, word, &keys[..], &mut self.db_cache.exact_word_docids, self.index.word_fid_docids.remap_data_type::(), + universe, merge_cbo_roaring_bitmaps, ) } @@ -294,12 +302,13 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value_from_keys::<_, _>( self.txn, prefix, &keys[..], &mut self.db_cache.word_prefix_docids, self.index.word_prefix_fid_docids.remap_data_type::(), + universe, merge_cbo_roaring_bitmaps, ) } @@ -325,12 +334,13 @@ impl<'ctx> SearchContext<'ctx> { let keys: Vec<_> = restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect(); - DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value_from_keys::<_, _>( self.txn, prefix, &keys[..], &mut self.db_cache.exact_word_prefix_docids, self.index.word_prefix_fid_docids.remap_data_type::(), + universe, merge_cbo_roaring_bitmaps, ) } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 577e12a39..b30306a0b 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; +use crate::localized_attributes_rules::LocalizedFieldIds; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::vector::Embedder; @@ -671,9 +672,44 @@ pub fn execute_search( tokbuilder.words_dict(dictionary); } - if let Some(locales) = locales { - tokbuilder.allow_list(locales); - } + let db_locales; + match locales { + Some(locales) => { + if !locales.is_empty() { + tokbuilder.allow_list(locales); + } + } + None => { + // If no locales are specified, we use the locales specified in the localized attributes rules + let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?; + let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?; + let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?; + + let localized_fields = match &ctx.restricted_fids { + // if AttributeToSearchOn is set, use the restricted list of ids + Some(restricted_fids) => { + let iter = restricted_fids + .exact + .iter() + .chain(restricted_fids.tolerant.iter()) + .map(|(fid, _)| *fid); + + LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter) + } + // Otherwise use the full list of ids coming from the index searchable fields + None => LocalizedFieldIds::new( + &localized_attributes_rules, + &fields_ids_map, + searchable_fields.into_iter(), + ), + }; + + db_locales = localized_fields.all_locales(); + if !db_locales.is_empty() { + tokbuilder.allow_list(&db_locales); + } + } + }; let tokenizer = tokbuilder.build(); drop(entered); diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 0faff9425..37bca7597 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -6,6 +6,7 @@ pub mod exactness; pub mod geo_sort; pub mod integration; #[cfg(feature = "all-tokenizations")] +#[cfg(not(feature = "chinese-pinyin"))] pub mod language; pub mod ngram_split_words; pub mod proximity; diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 6452a67a1..36dd20b15 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -12,6 +12,7 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; +use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, @@ -28,6 +29,116 @@ pub fn extract_facet_string_docids( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, +) -> Result<(grenad::Reader>, grenad::Reader>)> { + if settings_diff.settings_update_only() { + extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) + } else { + let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; + extract_facet_string_docids_document_update( + docid_fid_facet_string, + indexer, + localized_field_ids, + ) + } +} + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +fn extract_facet_string_docids_document_update( + docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, + localized_field_ids: &LocalizedFieldIds, +) -> Result<(grenad::Reader>, grenad::Reader>)> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_deladd_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut normalized_facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_deladd_btreeset_string, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut buffer = Vec::new(); + let mut cursor = docid_fid_facet_string.into_cursor()?; + while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { + let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); + + let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some() + && deladd_reader.get(DelAdd::Addition).is_some(); + + if is_same_value { + continue; + } + + let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + + let (document_id_bytes, normalized_value_bytes) = + try_split_array_at::<_, 4>(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + + let normalized_value = str::from_utf8(normalized_value_bytes)?; + + // Facet search normalization + { + let locales = localized_field_ids.locales(field_id); + let hyper_normalized_value = normalize_facet_string(normalized_value, locales); + + let set = BTreeSet::from_iter(std::iter::once(normalized_value)); + + // as the facet string is the same, we can put the deletion and addition in the same obkv. + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in deladd_reader.iter() { + let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; + obkv.insert(deladd_key, val)?; + } + obkv.finish()?; + + let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref()); + let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; + normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; + } + + let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + + buffer.clear(); + let mut obkv = KvWriterDelAdd::new(&mut buffer); + for (deladd_key, _) in deladd_reader.iter() { + obkv.insert(deladd_key, document_id.to_ne_bytes())?; + } + obkv.finish()?; + facet_string_docids_sorter.insert(&key_bytes, &buffer)?; + } + + let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?; + sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized)) +} + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +fn extract_facet_string_docids_settings( + docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, + settings_diff: &InnerIndexSettingsDiff, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -60,6 +171,15 @@ pub fn extract_facet_string_docids( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); + let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + + let are_same_locales = old_locales == new_locales; + + if is_same_value && are_same_locales { + continue; + } + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -68,15 +188,17 @@ pub fn extract_facet_string_docids( // Facet search normalization { - let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales); - let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); - let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales); + let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); + let new_hyper_normalized_value = if are_same_locales { + &old_hyper_normalized_value + } else { + &normalize_facet_string(normalized_value, new_locales) + }; let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == new_hyper_normalized_value { + if old_hyper_normalized_value == new_hyper_normalized_value.as_str() { // nothing to do if we delete and re-add the value. if is_same_value { continue; @@ -148,12 +270,21 @@ pub fn extract_facet_string_docids( /// Normalizes the facet string and truncates it to the max length. fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { - let options = NormalizerOption { lossy: true, ..Default::default() }; + let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() }; let mut detection = StrDetection::new(facet_string, locales); + + let script = detection.script(); + // Detect the language of the facet string only if several locales are explicitly provided. + let language = match locales { + Some(&[language]) => Some(language), + Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(), + _ => None, + }; + let token = Token { lemma: std::borrow::Cow::Borrowed(facet_string), - script: detection.script(), - language: detection.language(), + script, + language, ..Default::default() }; diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 810fa26a9..93c6ab408 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -9,7 +9,7 @@ use std::result::Result as StdResult; use bytemuck::bytes_of; use grenad::Sorter; use heed::BytesEncode; -use itertools::{merge_join_by, EitherOrBoth}; +use itertools::{merge_join_by, EitherOrBoth, Itertools}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::{from_slice, Value}; @@ -317,11 +317,15 @@ fn deladd_obkv_cbo_roaring_bitmaps( } /// Truncates a string to the biggest valid LMDB key size. -fn truncate_string(s: String) -> String { - s.char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect() +fn truncate_str(s: &str) -> &str { + let index = s + .char_indices() + .map(|(idx, _)| idx) + .chain(std::iter::once(s.len())) + .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH) + .last(); + + &s[..index.unwrap_or(0)] } /// Computes the diff between both Del and Add numbers and @@ -401,36 +405,102 @@ where del_strings.dedup(); add_strings.dedup(); + let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized); + let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized); + let merged_strings_iter = itertools::merge_join_by( del_strings.into_iter().filter(|(n, _)| !n.is_empty()), add_strings.into_iter().filter(|(n, _)| !n.is_empty()), - |del, add| del.cmp(add), + |(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add), ); // insert normalized and original facet string in sorter for eob in merged_strings_iter { key_buffer.truncate(TRUNCATE_SIZE); - match eob { - EitherOrBoth::Both(_, _) => (), // no need to touch anything - EitherOrBoth::Left((normalized, original)) => { - let truncated = truncate_string(normalized); + let (side, normalized, original) = match eob { + EitherOrBoth::Both((normalized, del), (_, add)) => { + let merged_strings_iter = + itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| { + original_del.cmp(original_add) + }); + + // FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side, + // but we possibly have multiple original values that changed in the case where the field is an + // array of multiple values that normalize to the same value. + // (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯) + // + // We'll work best effort by ignoring when the same value appears in both sides, deleting the first + // value that is only in the old version, and adding the first value that is only in the new version + let mut obkv = KvWriterDelAdd::memory(); + let mut del = None; + let mut add = None; + let mut both = None; + + for eob in merged_strings_iter { + match eob { + EitherOrBoth::Both((_normalized, original), _) => { + both = match both { + Some(both) => Some(both), + None => Some(original), + } + } + EitherOrBoth::Left((_normalized, original)) => { + del = match del { + Some(del) => Some(del), + None => Some(original), + }; + } + EitherOrBoth::Right((_normalized, original)) => { + add = match add { + Some(add) => Some(add), + None => Some(original), + } + } + } + } + + if let Some(del) = del { + obkv.insert(DelAdd::Deletion, del)?; + } + if let Some(add) = add + // prefer the newly added, but if there is none, keep a value in the list of values + // since the normalized value appears both in old and new, we should never remove it. + .or(both) + { + obkv.insert(DelAdd::Addition, add)?; + } + + let truncated = truncate_str(normalized); key_buffer.extend_from_slice(truncated.as_bytes()); - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, original)?; let bytes = obkv.into_inner()?; fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; + continue; } - EitherOrBoth::Right((normalized, original)) => { - let truncated = truncate_string(normalized); - key_buffer.extend_from_slice(truncated.as_bytes()); + EitherOrBoth::Left((_normalized, mut original)) => { + // FIXME: we only consider the first value for the purpose of facet search + // another structure is needed, able to retain all originals associated with a normalized value. + let Some((normalized, original)) = original.next() else { + continue; + }; + (DelAdd::Deletion, normalized, original) + } + EitherOrBoth::Right((_normalized, mut original)) => { + // FIXME: we only consider the first value for the purpose of facet search + // another structure is needed, able to retain all originals associated with a normalized value. + let Some((normalized, original)) = original.next() else { + continue; + }; + (DelAdd::Addition, normalized, original) + } + }; + let truncated = truncate_str(normalized); + key_buffer.extend_from_slice(truncated.as_bytes()); - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, original)?; - let bytes = obkv.into_inner()?; - fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; - } - } + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(side, original)?; + let bytes = obkv.into_inner()?; + fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; } Ok(()) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 39919d94a..87c6bc6db 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -290,7 +290,7 @@ where match result? { DocumentEdition::Deleted(docid) => { - documents_to_remove.push(docid); + documents_to_remove.insert(docid); } DocumentEdition::Edited(new_document) => { documents_batch_builder.append_json_object(&new_document)?; diff --git a/milli/src/vector/error.rs b/milli/src/vector/error.rs index 7e1cb8752..3c8cb4b06 100644 --- a/milli/src/vector/error.rs +++ b/milli/src/vector/error.rs @@ -62,8 +62,18 @@ pub enum EmbedErrorKind { RestResponseDeserialization(std::io::Error), #[error("expected a response containing {0} embeddings, got only {1}")] RestResponseEmbeddingCount(usize, usize), - #[error("could not authenticate against embedding server{}", option_info(.0.as_deref(), "server replied with "))] - RestUnauthorized(Option), + #[error("could not authenticate against {embedding} server{server_reply}{hint}", embedding=match *.1 { + ConfigurationSource::User => "embedding", + ConfigurationSource::OpenAi => "OpenAI", + ConfigurationSource::Ollama => "ollama" + }, + server_reply=option_info(.0.as_deref(), "server replied with "), + hint=match *.1 { + ConfigurationSource::User => "\n - Hint: Check the `apiKey` parameter in the embedder configuration", + ConfigurationSource::OpenAi => "\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + ConfigurationSource::Ollama => "\n - Hint: Check the `apiKey` parameter in the embedder configuration" + })] + RestUnauthorized(Option, ConfigurationSource), #[error("sent too many requests to embedding server{}", option_info(.0.as_deref(), "server replied with "))] RestTooManyRequests(Option), #[error("sent a bad request to embedding server{}{}", @@ -136,8 +146,14 @@ impl EmbedError { } } - pub(crate) fn rest_unauthorized(error_response: Option) -> EmbedError { - Self { kind: EmbedErrorKind::RestUnauthorized(error_response), fault: FaultSource::User } + pub(crate) fn rest_unauthorized( + error_response: Option, + configuration_source: ConfigurationSource, + ) -> EmbedError { + Self { + kind: EmbedErrorKind::RestUnauthorized(error_response, configuration_source), + fault: FaultSource::User, + } } pub(crate) fn rest_too_many_requests(error_response: Option) -> EmbedError { diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index ce63e69d7..cef45f90e 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -183,7 +183,7 @@ impl Embedder { let rest_embedder = RestEmbedder::new( RestEmbedderOptions { - api_key: Some(api_key.clone()), + api_key: (!api_key.is_empty()).then(|| api_key.clone()), distribution: None, dimensions: Some(options.dimensions()), url, diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index 593d2b509..2538f2fff 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -275,7 +275,10 @@ fn check_response( Err(ureq::Error::Status(code, response)) => { let error_response: Option = response.into_string().ok(); Err(match code { - 401 => Retry::give_up(EmbedError::rest_unauthorized(error_response)), + 401 => Retry::give_up(EmbedError::rest_unauthorized( + error_response, + configuration_source, + )), 429 => Retry::rate_limited(EmbedError::rest_too_many_requests(error_response)), 400 => Retry::give_up(EmbedError::rest_bad_request( error_response,