Merge #4888

4888: bring back v1.10.0 into main r=Kerollmops a=ManyTheFish Co-authored-by: Louis Dureuil <louis@meilisearch.com> Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com> Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>
2024-11-22 18:17:39 +08:00 · 2024-08-27 14:02:08 +00:00 · 2024-08-27 14:02:08 +00:00 · 9a756cf2c5
commit 9a756cf2c5
parent 321639364f 36d8684dc8
27 changed files with 2618 additions and 99 deletions
--- a/index-scheduler/src/index_mapper/mod.rs
+++ b/index-scheduler/src/index_mapper/mod.rs
@@ -108,8 +108,10 @@ pub struct IndexStats {
    /// Association of every field name with the number of times it occurs in the documents.
    pub field_distribution: FieldDistribution,
    /// Creation date of the index.
+    #[serde(with = "time::serde::rfc3339")]
    pub created_at: OffsetDateTime,
    /// Date of the last update of the index.
+    #[serde(with = "time::serde::rfc3339")]
    pub updated_at: OffsetDateTime,
 }

--- a/meilisearch/src/main.rs
+++ b/meilisearch/src/main.rs
@@ -72,6 +72,19 @@ fn on_panic(info: &std::panic::PanicInfo) {

 #[actix_web::main]
 async fn main() -> anyhow::Result<()> {
+    try_main().await.inspect_err(|error| {
+        tracing::error!(%error);
+        let mut current = error.source();
+        let mut depth = 0;
+        while let Some(source) = current {
+            tracing::info!(%source, depth, "Error caused by");
+            current = source.source();
+            depth += 1;
+        }
+    })
+}
+
+async fn try_main() -> anyhow::Result<()> {
    let (opt, config_read_from) = Opt::try_build()?;

    std::panic::set_hook(Box::new(on_panic));
--- a/meilisearch/src/routes/indexes/settings.rs
+++ b/meilisearch/src/routes/indexes/settings.rs
@@ -682,6 +682,7 @@ generate_configure!(
    filterable_attributes,
    sortable_attributes,
    displayed_attributes,
+    localized_attributes,
    searchable_attributes,
    distinct_attribute,
    proximity_precision,
--- a/meilisearch/src/search/mod.rs
+++ b/meilisearch/src/search/mod.rs
@@ -1369,12 +1369,18 @@ pub fn perform_facet_search(
        None => TimeBudget::default(),
    };

+    // In the faceted search context, we want to use the intersection between the locales provided by the user
+    // and the locales of the facet string.
+    // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale.
+    // If the user does not provide locales, we use the locales of the facet string.
    let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
-    let locales = locales.or_else(|| {
-        localized_attributes
+    let localized_attributes_locales =
+        localized_attributes.into_iter().find(|attr| attr.match_str(&facet_name));
+    let locales = localized_attributes_locales.map(|attr| {
+        attr.locales
            .into_iter()
-            .find(|attr| attr.match_str(&facet_name))
-            .map(|attr| attr.locales)
+            .filter(|locale| locales.as_ref().map_or(true, |locales| locales.contains(locale)))
+            .collect()
    });

    let (search, _, _, _) =
--- a/meilisearch/tests/search/locales.rs
+++ b/meilisearch/tests/search/locales.rs
@@ -386,12 +386,39 @@ async fn force_locales() {
            |response, code| {
                snapshot!(response, @r###"
                {
-                  "hits": [],
+                  "hits": [
+                    {
+                      "name_zh": "进击的巨人",
+                      "author_zh": "諫山創",
+                      "description_zh": "进击的巨人是日本的漫画系列，由諫山 創作画。",
+                      "id": 853,
+                      "_vectors": {
+                        "manual": [
+                          1.0,
+                          2.0,
+                          3.0
+                        ]
+                      },
+                      "_formatted": {
+                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "author_zh": "諫山創",
+                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "id": "853",
+                        "_vectors": {
+                          "manual": [
+                            "1.0",
+                            "2.0",
+                            "3.0"
+                          ]
+                        }
+                      }
+                    }
+                  ],
                  "query": "\"进击的巨人\"",
                  "processingTimeMs": "[duration]",
                  "limit": 20,
                  "offset": 0,
-                  "estimatedTotalHits": 0
+                  "estimatedTotalHits": 1
                }
                "###);
                snapshot!(code, @"200 OK");
@@ -483,12 +510,39 @@ async fn force_locales_with_pattern() {
            |response, code| {
                snapshot!(response, @r###"
                {
-                  "hits": [],
+                  "hits": [
+                    {
+                      "name_zh": "进击的巨人",
+                      "author_zh": "諫山創",
+                      "description_zh": "进击的巨人是日本的漫画系列，由諫山 創作画。",
+                      "id": 853,
+                      "_vectors": {
+                        "manual": [
+                          1.0,
+                          2.0,
+                          3.0
+                        ]
+                      },
+                      "_formatted": {
+                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "author_zh": "諫山創",
+                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "id": "853",
+                        "_vectors": {
+                          "manual": [
+                            "1.0",
+                            "2.0",
+                            "3.0"
+                          ]
+                        }
+                      }
+                    }
+                  ],
                  "query": "\"进击的巨人\"",
                  "processingTimeMs": "[duration]",
                  "limit": 20,
                  "offset": 0,
-                  "estimatedTotalHits": 0
+                  "estimatedTotalHits": 1
                }
                "###);
                snapshot!(code, @"200 OK");
@@ -761,6 +815,275 @@ async fn force_different_locales_with_pattern() {
        .await;
 }

+#[actix_rt::test]
+async fn auto_infer_locales_at_search_with_attributes_to_search_on() {
+    let server = Server::new().await;
+
+    let index = server.index("test");
+    let documents = DOCUMENTS.clone();
+    let (response, _) = index
+        .update_settings(
+            json!({
+                "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"],
+                "localizedAttributes": [
+                    // force japanese
+                    {"attributePatterns": ["*_zh"], "locales": ["jpn"]},
+                    // force chinese
+                    {"attributePatterns": ["*_ja"], "locales": ["cmn"]},
+                    // any language
+                    {"attributePatterns": ["*_en"], "locales": []}
+                ]
+            }),
+        )
+        .await;
+    snapshot!(response, @r###"
+    {
+      "taskUid": 0,
+      "indexUid": "test",
+      "status": "enqueued",
+      "type": "settingsUpdate",
+      "enqueuedAt": "[date]"
+    }
+    "###);
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    // auto infer any language
+    index
+        .search(
+            json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
+            |response, code| {
+                snapshot!(response, @r###"
+                {
+                  "hits": [],
+                  "query": "\"进击的巨人\"",
+                  "processingTimeMs": "[duration]",
+                  "limit": 20,
+                  "offset": 0,
+                  "estimatedTotalHits": 0
+                }
+                "###);
+                snapshot!(code, @"200 OK");
+            },
+        )
+        .await;
+
+    // should infer chinese
+    index
+            .search(
+                json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"], "attributesToSearchOn": ["name_zh", "description_zh"]}),
+                |response, code| {
+                    snapshot!(response, @r###"
+                    {
+                      "hits": [
+                        {
+                          "name_zh": "进击的巨人",
+                          "author_zh": "諫山創",
+                          "description_zh": "进击的巨人是日本的漫画系列，由諫山 創作画。",
+                          "id": 853,
+                          "_vectors": {
+                            "manual": [
+                              1.0,
+                              2.0,
+                              3.0
+                            ]
+                          },
+                          "_formatted": {
+                            "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                            "author_zh": "諫山創",
+                            "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                            "id": "853",
+                            "_vectors": {
+                              "manual": [
+                                "1.0",
+                                "2.0",
+                                "3.0"
+                              ]
+                            }
+                          }
+                        }
+                      ],
+                      "query": "\"进击的巨人\"",
+                      "processingTimeMs": "[duration]",
+                      "limit": 20,
+                      "offset": 0,
+                      "estimatedTotalHits": 1
+                    }
+                    "###);
+                    snapshot!(code, @"200 OK");
+                },
+            )
+            .await;
+}
+
+#[actix_rt::test]
+async fn auto_infer_locales_at_search() {
+    let server = Server::new().await;
+
+    let index = server.index("test");
+    let documents = DOCUMENTS.clone();
+    let (response, _) = index
+        .update_settings(
+            json!({
+                "searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"],
+                "localizedAttributes": [
+                    // force japanese
+                    {"attributePatterns": ["*"], "locales": ["jpn"]},
+                ]
+            }),
+        )
+        .await;
+    snapshot!(response, @r###"
+    {
+      "taskUid": 0,
+      "indexUid": "test",
+      "status": "enqueued",
+      "type": "settingsUpdate",
+      "enqueuedAt": "[date]"
+    }
+    "###);
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    index
+        .search(
+            json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
+            |response, code| {
+                snapshot!(response, @r###"
+                {
+                  "hits": [
+                    {
+                      "name_zh": "进击的巨人",
+                      "author_zh": "諫山創",
+                      "description_zh": "进击的巨人是日本的漫画系列，由諫山 創作画。",
+                      "id": 853,
+                      "_vectors": {
+                        "manual": [
+                          1.0,
+                          2.0,
+                          3.0
+                        ]
+                      },
+                      "_formatted": {
+                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "author_zh": "諫山創",
+                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "id": "853",
+                        "_vectors": {
+                          "manual": [
+                            "1.0",
+                            "2.0",
+                            "3.0"
+                          ]
+                        }
+                      }
+                    }
+                  ],
+                  "query": "\"进击的巨人\"",
+                  "processingTimeMs": "[duration]",
+                  "limit": 20,
+                  "offset": 0,
+                  "estimatedTotalHits": 1
+                }
+                "###);
+                snapshot!(code, @"200 OK");
+            },
+        )
+        .await;
+
+    index
+            .search(
+                json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
+                |response, code| {
+                    snapshot!(response, @r###"
+                    {
+                      "hits": [
+                        {
+                          "name_zh": "进击的巨人",
+                          "author_zh": "諫山創",
+                          "description_zh": "进击的巨人是日本的漫画系列，由諫山 創作画。",
+                          "id": 853,
+                          "_vectors": {
+                            "manual": [
+                              1.0,
+                              2.0,
+                              3.0
+                            ]
+                          },
+                          "_formatted": {
+                            "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                            "author_zh": "諫山創",
+                            "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                            "id": "853",
+                            "_vectors": {
+                              "manual": [
+                                "1.0",
+                                "2.0",
+                                "3.0"
+                              ]
+                            }
+                          }
+                        }
+                      ],
+                      "query": "\"进击的巨人\"",
+                      "processingTimeMs": "[duration]",
+                      "limit": 20,
+                      "offset": 0,
+                      "estimatedTotalHits": 1
+                    }
+                    "###);
+                    snapshot!(code, @"200 OK");
+                },
+            )
+            .await;
+
+    index
+        .search(
+            json!({"q": "\"进击的巨人\"", "attributesToHighlight": ["*"]}),
+            |response, code| {
+                snapshot!(response, @r###"
+                {
+                  "hits": [
+                    {
+                      "name_zh": "进击的巨人",
+                      "author_zh": "諫山創",
+                      "description_zh": "进击的巨人是日本的漫画系列，由諫山 創作画。",
+                      "id": 853,
+                      "_vectors": {
+                        "manual": [
+                          1.0,
+                          2.0,
+                          3.0
+                        ]
+                      },
+                      "_formatted": {
+                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "author_zh": "諫山創",
+                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "id": "853",
+                        "_vectors": {
+                          "manual": [
+                            "1.0",
+                            "2.0",
+                            "3.0"
+                          ]
+                        }
+                      }
+                    }
+                  ],
+                  "query": "\"进击的巨人\"",
+                  "processingTimeMs": "[duration]",
+                  "limit": 20,
+                  "offset": 0,
+                  "estimatedTotalHits": 1
+                }
+                "###);
+                snapshot!(code, @"200 OK");
+            },
+        )
+        .await;
+}
+
 #[actix_rt::test]
 async fn force_different_locales_with_pattern_nested() {
    let server = Server::new().await;
--- a/meilisearch/tests/search/mod.rs
+++ b/meilisearch/tests/search/mod.rs
@@ -7,6 +7,7 @@ mod facet_search;
 mod formatted;
 mod geo;
 mod hybrid;
+#[cfg(not(feature = "chinese-pinyin"))]
 mod locales;
 mod matching_strategy;
 mod multi;
@@ -169,6 +170,7 @@ async fn negative_special_cases_search() {
 }

 #[cfg(feature = "default")]
+#[cfg(not(feature = "chinese-pinyin"))]
 #[actix_rt::test]
 async fn test_kanji_language_detection() {
    let server = Server::new().await;
--- a/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap
+++ b/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap
@@ -2,7 +2,7 @@
 source: meilisearch/tests/search/errors.rs
 ---
 {
-  "uid": 0,
+  "uid": "[uid]",
  "indexUid": "tamo",
  "status": "succeeded",
  "type": "indexCreation",
--- a/meilisearch/tests/settings/get_settings.rs
+++ b/meilisearch/tests/settings/get_settings.rs
@@ -9,6 +9,7 @@ static DEFAULT_SETTINGS_VALUES: Lazy<HashMap<&'static str, Value>> = Lazy::new(|
    let mut map = HashMap::new();
    map.insert("displayed_attributes", json!(["*"]));
    map.insert("searchable_attributes", json!(["*"]));
+    map.insert("localized_attributes", json!(null));
    map.insert("filterable_attributes", json!([]));
    map.insert("distinct_attribute", json!(null));
    map.insert(
@@ -409,6 +410,7 @@ macro_rules! test_setting_routes {
 test_setting_routes!(
    filterable_attributes put,
    displayed_attributes put,
+    localized_attributes put,
    searchable_attributes put,
    distinct_attribute put,
    stop_words put,
--- a/meilisearch/tests/vector/intel_gen.txt.gz
+++ b/meilisearch/tests/vector/intel_gen.txt.gz
--- a/meilisearch/tests/vector/mod.rs
+++ b/meilisearch/tests/vector/mod.rs
@@ -1,3 +1,4 @@
+mod openai;
 mod rest;
 mod settings;

@@ -10,6 +11,22 @@ use crate::common::index::Index;
 use crate::common::{default_settings, GetAllDocumentsOptions, Server};
 use crate::json;

+async fn get_server_vector() -> Server {
+    let server = Server::new().await;
+    let (value, code) = server.set_features(json!({"vectorStore": true})).await;
+    snapshot!(code, @"200 OK");
+    snapshot!(value, @r###"
+  {
+    "vectorStore": true,
+    "metrics": false,
+    "logsRoute": false,
+    "editDocumentsByFunction": false,
+    "containsFilter": false
+  }
+  "###);
+    server
+}
+
 #[actix_rt::test]
 async fn add_remove_user_provided() {
    let server = Server::new().await;
--- a/meilisearch/tests/vector/openai.rs
+++ b/meilisearch/tests/vector/openai.rs
--- a/meilisearch/tests/vector/openai_responses.json.gz
+++ b/meilisearch/tests/vector/openai_responses.json.gz
--- a/meilisearch/tests/vector/openai_tokenized_responses.json.gz
+++ b/meilisearch/tests/vector/openai_tokenized_responses.json.gz
--- a/meilisearch/tests/vector/rest.rs
+++ b/meilisearch/tests/vector/rest.rs
@@ -5,9 +5,9 @@ use reqwest::IntoUrl;
 use wiremock::matchers::{method, path};
 use wiremock::{Mock, MockServer, Request, ResponseTemplate};

-use crate::common::{Server, Value};
+use crate::common::Value;
 use crate::json;
-use crate::vector::GetAllDocumentsOptions;
+use crate::vector::{get_server_vector, GetAllDocumentsOptions};

 async fn create_mock() -> (MockServer, Value) {
    let mock_server = MockServer::start().await;
@@ -265,22 +265,6 @@ async fn dummy_testing_the_mock() {
    snapshot!(body, @r###"{"data":[4,4,4]}"###);
 }

-async fn get_server_vector() -> Server {
-    let server = Server::new().await;
-    let (value, code) = server.set_features(json!({"vectorStore": true})).await;
-    snapshot!(code, @"200 OK");
-    snapshot!(value, @r###"
-    {
-      "vectorStore": true,
-      "metrics": false,
-      "logsRoute": false,
-      "editDocumentsByFunction": false,
-      "containsFilter": false
-    }
-    "###);
-    server
-}
-
 #[actix_rt::test]
 async fn bad_request() {
    let (mock, _setting) = create_mock().await;
@@ -1816,7 +1800,7 @@ async fn server_custom_header() {
        }
      },
      "error": {
-        "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n  - test embedding failed with user error: could not authenticate against embedding server\n  - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`",
+        "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n  - test embedding failed with user error: could not authenticate against embedding server\n  - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n  - Hint: Check the `apiKey` parameter in the embedder configuration",
        "code": "vector_embedding_error",
        "type": "invalid_request",
        "link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@@ -1858,7 +1842,7 @@ async fn server_custom_header() {
        }
      },
      "error": {
-        "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n  - test embedding failed with user error: could not authenticate against embedding server\n  - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`",
+        "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n  - test embedding failed with user error: could not authenticate against embedding server\n  - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n  - Hint: Check the `apiKey` parameter in the embedder configuration",
        "code": "vector_embedding_error",
        "type": "invalid_request",
        "link": "https://docs.meilisearch.com/errors#vector_embedding_error"
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -9,7 +9,6 @@ use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
 use roaring::RoaringBitmap;
 use rstar::RTree;
 use serde::{Deserialize, Serialize};
-use time::OffsetDateTime;

 use crate::documents::PrimaryKey;
 use crate::error::{InternalError, UserError};
@@ -173,8 +172,8 @@ impl Index {
    pub fn new_with_creation_dates<P: AsRef<Path>>(
        mut options: heed::EnvOpenOptions,
        path: P,
-        created_at: OffsetDateTime,
-        updated_at: OffsetDateTime,
+        created_at: time::OffsetDateTime,
+        updated_at: time::OffsetDateTime,
    ) -> Result<Index> {
        use db_name::*;

@@ -256,22 +255,22 @@ impl Index {
    }

    pub fn new<P: AsRef<Path>>(options: heed::EnvOpenOptions, path: P) -> Result<Index> {
-        let now = OffsetDateTime::now_utc();
+        let now = time::OffsetDateTime::now_utc();
        Self::new_with_creation_dates(options, path, now, now)
    }

    fn set_creation_dates(
        env: &heed::Env,
        main: Database<Unspecified, Unspecified>,
-        created_at: OffsetDateTime,
-        updated_at: OffsetDateTime,
+        created_at: time::OffsetDateTime,
+        updated_at: time::OffsetDateTime,
    ) -> heed::Result<()> {
        let mut txn = env.write_txn()?;
        // The db was just created, we update its metadata with the relevant information.
        let main = main.remap_types::<Str, SerdeJson<OffsetDateTime>>();
        if main.get(&txn, main_key::CREATED_AT_KEY)?.is_none() {
-            main.put(&mut txn, main_key::UPDATED_AT_KEY, &updated_at)?;
-            main.put(&mut txn, main_key::CREATED_AT_KEY, &created_at)?;
+            main.put(&mut txn, main_key::UPDATED_AT_KEY, &OffsetDateTime(updated_at))?;
+            main.put(&mut txn, main_key::CREATED_AT_KEY, &OffsetDateTime(created_at))?;
            txn.commit()?;
        }
        Ok(())
@@ -371,7 +370,7 @@ impl Index {
        wtxn: &mut RwTxn<'_>,
        primary_key: &str,
    ) -> heed::Result<()> {
-        self.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
+        self.set_updated_at(wtxn, &time::OffsetDateTime::now_utc())?;
        self.main.remap_types::<Str, Str>().put(wtxn, main_key::PRIMARY_KEY_KEY, primary_key)
    }

@@ -1323,7 +1322,7 @@ impl Index {
    }

    /// Returns the index creation time.
-    pub fn created_at(&self, rtxn: &RoTxn<'_>) -> Result<OffsetDateTime> {
+    pub fn created_at(&self, rtxn: &RoTxn<'_>) -> Result<time::OffsetDateTime> {
        Ok(self
            .main
            .remap_types::<Str, SerdeJson<OffsetDateTime>>()
@@ -1331,11 +1330,12 @@ impl Index {
            .ok_or(InternalError::DatabaseMissingEntry {
                db_name: db_name::MAIN,
                key: Some(main_key::CREATED_AT_KEY),
-            })?)
+            })?
+            .0)
    }

    /// Returns the index last updated time.
-    pub fn updated_at(&self, rtxn: &RoTxn<'_>) -> Result<OffsetDateTime> {
+    pub fn updated_at(&self, rtxn: &RoTxn<'_>) -> Result<time::OffsetDateTime> {
        Ok(self
            .main
            .remap_types::<Str, SerdeJson<OffsetDateTime>>()
@@ -1343,18 +1343,19 @@ impl Index {
            .ok_or(InternalError::DatabaseMissingEntry {
                db_name: db_name::MAIN,
                key: Some(main_key::UPDATED_AT_KEY),
-            })?)
+            })?
+            .0)
    }

    pub(crate) fn set_updated_at(
        &self,
        wtxn: &mut RwTxn<'_>,
-        time: &OffsetDateTime,
+        time: &time::OffsetDateTime,
    ) -> heed::Result<()> {
        self.main.remap_types::<Str, SerdeJson<OffsetDateTime>>().put(
            wtxn,
            main_key::UPDATED_AT_KEY,
-            time,
+            &OffsetDateTime(*time),
        )
    }

@@ -1681,6 +1682,10 @@ pub struct IndexEmbeddingConfig {
    pub user_provided: RoaringBitmap,
 }

+#[derive(Serialize, Deserialize)]
+#[serde(transparent)]
+struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] time::OffsetDateTime);
+
 #[cfg(test)]
 pub(crate) mod tests {
    use std::collections::HashSet;
--- a/milli/src/localized_attributes_rules.rs
+++ b/milli/src/localized_attributes_rules.rs
@@ -90,6 +90,21 @@ impl LocalizedFieldIds {
    pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> {
        self.field_id_to_locales.get(&fields_id).map(Vec::as_slice)
    }
+
+    pub fn all_locales(&self) -> Vec<Language> {
+        let mut locales = Vec::new();
+        for field_locales in self.field_id_to_locales.values() {
+            if !field_locales.is_empty() {
+                locales.extend(field_locales);
+            } else {
+                // If a field has no locales, we consider it as not localized
+                return Vec::new();
+            }
+        }
+        locales.sort();
+        locales.dedup();
+        locales
+    }
 }

 #[cfg(test)]
--- a/milli/src/search/facet/search.rs
+++ b/milli/src/search/facet/search.rs
@@ -339,10 +339,18 @@ impl ValuesCollection {
 fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
    let options = NormalizerOption { lossy: true, ..Default::default() };
    let mut detection = StrDetection::new(facet_string, locales);
+
+    // Detect the language of the facet string only if several locales are explicitly provided.
+    let language = match locales {
+        Some(&[language]) => Some(language),
+        Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
+        _ => None,
+    };
+
    let token = Token {
        lemma: std::borrow::Cow::Borrowed(facet_string),
        script: detection.script(),
-        language: detection.language(),
+        language,
        ..Default::default()
    };

--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -360,6 +360,7 @@ mod test {
    use super::*;

    #[cfg(feature = "japanese")]
+    #[cfg(not(feature = "chinese-pinyin"))]
    #[test]
    fn test_kanji_language_detection() {
        use crate::index::tests::TempIndex;
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -110,18 +110,18 @@ impl<'ctx> DatabaseCache<'ctx> {
            .map_err(Into::into)
    }

-    fn get_value_from_keys<'v, K1, KC, DC>(
+    fn get_value_from_keys<'v, K1, KC>(
        txn: &'ctx RoTxn<'_>,
        cache_key: K1,
        db_keys: &'v [KC::EItem],
        cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
        db: Database<KC, Bytes>,
+        universe: Option<&RoaringBitmap>,
        merger: MergeFn,
-    ) -> Result<Option<DC::DItem>>
+    ) -> Result<Option<RoaringBitmap>>
    where
        K1: Copy + Eq + Hash,
        KC: BytesEncode<'v>,
-        DC: BytesDecodeOwned,
        KC::EItem: Sized,
    {
        if let Entry::Vacant(entry) = cache.entry(cache_key) {
@@ -146,16 +146,22 @@ impl<'ctx> DatabaseCache<'ctx> {
            entry.insert(bitmap_ptr);
        }

-        match cache.get(&cache_key).unwrap() {
-            Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes)
+        let bitmap_bytes = match cache.get(&cache_key).unwrap() {
+            Some(Cow::Borrowed(bytes)) => bytes,
+            Some(Cow::Owned(bytes)) => bytes.as_slice(),
+            None => return Ok(None),
+        };
+
+        match (bitmap_bytes, universe) {
+            (bytes, Some(universe)) => {
+                CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
+                    .map(Some)
+                    .map_err(Into::into)
+            }
+            (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
                .map(Some)
                .map_err(heed::Error::Decoding)
                .map_err(Into::into),
-            Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes)
-                .map(Some)
-                .map_err(heed::Error::Decoding)
-                .map_err(Into::into),
-            None => Ok(None),
        }
    }
 }
@@ -207,12 +213,13 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
+                DatabaseCache::get_value_from_keys::<_, _>(
                    self.txn,
                    word,
                    &keys[..],
                    &mut self.db_cache.word_docids,
                    self.index.word_fid_docids.remap_data_type::<Bytes>(),
+                    universe,
                    merge_cbo_roaring_bitmaps,
                )
            }
@@ -238,12 +245,13 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
+                DatabaseCache::get_value_from_keys::<_, _>(
                    self.txn,
                    word,
                    &keys[..],
                    &mut self.db_cache.exact_word_docids,
                    self.index.word_fid_docids.remap_data_type::<Bytes>(),
+                    universe,
                    merge_cbo_roaring_bitmaps,
                )
            }
@@ -294,12 +302,13 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
+                DatabaseCache::get_value_from_keys::<_, _>(
                    self.txn,
                    prefix,
                    &keys[..],
                    &mut self.db_cache.word_prefix_docids,
                    self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
+                    universe,
                    merge_cbo_roaring_bitmaps,
                )
            }
@ -325,12 +334,13 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
+                DatabaseCache::get_value_from_keys::<_, _>(
                    self.txn,
                    prefix,
                    &keys[..],
                    &mut self.db_cache.exact_word_prefix_docids,
                    self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
+                    universe,
                    merge_cbo_roaring_bitmaps,
                )
            }
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
 use self::graph_based_ranking_rule::Words;
 use self::interner::Interned;
 use self::vector_sort::VectorSort;
+use crate::localized_attributes_rules::LocalizedFieldIds;
 use crate::score_details::{ScoreDetails, ScoringStrategy};
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::vector::Embedder;
@ -671,9 +672,44 @@ pub fn execute_search(
            tokbuilder.words_dict(dictionary);
        }

-        if let Some(locales) = locales {
+        let db_locales;
+        match locales {
+            Some(locales) => {
+                if !locales.is_empty() {
                    tokbuilder.allow_list(locales);
                }
+            }
+            None => {
+                // If no locales are specified, we use the locales specified in the localized attributes rules
+                let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
+                let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
+                let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;
+
+                let localized_fields = match &ctx.restricted_fids {
+                    // if AttributeToSearchOn is set, use the restricted list of ids
+                    Some(restricted_fids) => {
+                        let iter = restricted_fids
+                            .exact
+                            .iter()
+                            .chain(restricted_fids.tolerant.iter())
+                            .map(|(fid, _)| *fid);
+
+                        LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
+                    }
+                    // Otherwise use the full list of ids coming from the index searchable fields
+                    None => LocalizedFieldIds::new(
+                        &localized_attributes_rules,
+                        &fields_ids_map,
+                        searchable_fields.into_iter(),
+                    ),
+                };
+
+                db_locales = localized_fields.all_locales();
+                if !db_locales.is_empty() {
+                    tokbuilder.allow_list(&db_locales);
+                }
+            }
+        };

        let tokenizer = tokbuilder.build();
        drop(entered);
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@ -6,6 +6,7 @@ pub mod exactness;
 pub mod geo_sort;
 pub mod integration;
 #[cfg(feature = "all-tokenizations")]
+#[cfg(not(feature = "chinese-pinyin"))]
 pub mod language;
 pub mod ngram_split_words;
 pub mod proximity;
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@ -12,6 +12,7 @@ use heed::BytesEncode;
 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
 use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
+use crate::localized_attributes_rules::LocalizedFieldIds;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::{
    merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
@ -28,6 +29,116 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
    docid_fid_facet_string: grenad::Reader<R>,
    indexer: GrenadParameters,
    settings_diff: &InnerIndexSettingsDiff,
+) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
+    if settings_diff.settings_update_only() {
+        extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
+    } else {
+        let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
+        extract_facet_string_docids_document_update(
+            docid_fid_facet_string,
+            indexer,
+            localized_field_ids,
+        )
+    }
+}
+
+/// Extracts the facet strings and the ids of the documents in which each facet string appears.
+///
+/// This variant runs for document updates. It returns two grenad readers: the level-0 facet
+/// string docids, then the facet-search entries keyed by the locale-normalized facet string.
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
+fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
+    docid_fid_facet_string: grenad::Reader<R>,
+    indexer: GrenadParameters,
+    localized_field_ids: &LocalizedFieldIds,
+) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut facet_string_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Stable,
+        merge_deladd_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 2), // half of the per-thread budget for each sorter
+    );
+
+    let mut normalized_facet_string_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Stable,
+        merge_deladd_btreeset_string,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 2),
+    );
+
+    let mut buffer = Vec::new();
+    let mut cursor = docid_fid_facet_string.into_cursor()?;
+    while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
+        let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
+
+        let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
+            && deladd_reader.get(DelAdd::Addition).is_some();
+
+        if is_same_value {
+            continue; // deleted and re-added at once: nothing to extract
+        }
+
+        let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
+        let field_id = FieldId::from_be_bytes(field_id_bytes);
+
+        let (document_id_bytes, normalized_value_bytes) =
+            try_split_array_at::<_, 4>(bytes).unwrap();
+        let document_id = u32::from_be_bytes(document_id_bytes);
+
+        let normalized_value = str::from_utf8(normalized_value_bytes)?;
+
+        // Facet search normalization: index the value under its locale-aware normalized form.
+        {
+            let locales = localized_field_ids.locales(field_id);
+            let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
+
+            let set = BTreeSet::from_iter(std::iter::once(normalized_value));
+
+            // The facet string is the same on both sides, so one obkv carries every del/add entry.
+            buffer.clear();
+            let mut obkv = KvWriterDelAdd::new(&mut buffer);
+            for (deladd_key, _) in deladd_reader.iter() {
+                let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
+                obkv.insert(deladd_key, val)?;
+            }
+            obkv.finish()?;
+
+            let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
+            let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
+            normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
+        }
+
+        let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
+        let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
+
+        buffer.clear();
+        let mut obkv = KvWriterDelAdd::new(&mut buffer);
+        for (deladd_key, _) in deladd_reader.iter() {
+            obkv.insert(deladd_key, document_id.to_ne_bytes())?;
+        }
+        obkv.finish()?;
+        facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
+    }
+
+    let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
+    sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
+}
+
+/// Extracts the facet strings and the ids of the documents in which each facet string appears.
+///
+/// This variant runs when only the settings changed. It returns a pair of grenad readers holding
+/// the extracted facet strings and document ids from the given chunk of docid facet strings.
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
+fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
+    docid_fid_facet_string: grenad::Reader<R>,
+    indexer: GrenadParameters,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
    let max_memory = indexer.max_memory_by_thread();

@ -60,6 +171,15 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
        let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
        let field_id = FieldId::from_be_bytes(field_id_bytes);

+        let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
+        let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
+
+        let are_same_locales = old_locales == new_locales;
+
+        if is_same_value && are_same_locales {
+            continue;
+        }
+
        let (document_id_bytes, normalized_value_bytes) =
            try_split_array_at::<_, 4>(bytes).unwrap();
        let document_id = u32::from_be_bytes(document_id_bytes);
@ -68,15 +188,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(

        // Facet search normalization
        {
-            let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
-            let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
-            let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
-            let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales);
+            let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
+            let new_hyper_normalized_value = if are_same_locales {
+                &old_hyper_normalized_value
+            } else {
+                &normalize_facet_string(normalized_value, new_locales)
+            };

            let set = BTreeSet::from_iter(std::iter::once(normalized_value));

            // if the facet string is the same, we can put the deletion and addition in the same obkv.
-            if old_hyper_normalized_value == new_hyper_normalized_value {
+            if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
                // nothing to do if we delete and re-add the value.
                if is_same_value {
                    continue;
@ -148,12 +270,21 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(

 /// Normalizes the facet string and truncates it to the max length.
 fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
-    let options = NormalizerOption { lossy: true, ..Default::default() };
+    let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
    let mut detection = StrDetection::new(facet_string, locales);
+
+    let script = detection.script();
+    // Detect the language of the facet string only if several locales are explicitly provided.
+    let language = match locales {
+        Some(&[language]) => Some(language),
+        Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
+        _ => None,
+    };
+
    let token = Token {
        lemma: std::borrow::Cow::Borrowed(facet_string),
-        script: detection.script(),
-        language: detection.language(),
+        script,
+        language,
        ..Default::default()
    };

--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@ -9,7 +9,7 @@ use std::result::Result as StdResult;
 use bytemuck::bytes_of;
 use grenad::Sorter;
 use heed::BytesEncode;
-use itertools::{merge_join_by, EitherOrBoth};
+use itertools::{merge_join_by, EitherOrBoth, Itertools};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::{from_slice, Value};
@ -317,11 +317,15 @@ fn deladd_obkv_cbo_roaring_bitmaps(
 }

 /// Truncates a string to the biggest valid LMDB key size.
-fn truncate_string(s: String) -> String {
-    s.char_indices()
-        .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
-        .map(|(_, c)| c)
-        .collect()
+fn truncate_str(s: &str) -> &str {
+    let index = s
+        .char_indices()
+        .map(|(idx, _)| idx)
+        .chain(std::iter::once(s.len())) // the end of the string is a valid cut point too
+        .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
+        .last(); // largest char boundary that fits in MAX_FACET_VALUE_LENGTH bytes
+
+    &s[..index.unwrap_or(0)]
 }

 /// Computes the diff between both Del and Add numbers and
@ -401,37 +405,103 @@ where
    del_strings.dedup();
    add_strings.dedup();

+    let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
+    let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
+
    let merged_strings_iter = itertools::merge_join_by(
        del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
        add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
-        |del, add| del.cmp(add),
+        |(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
    );

    // insert normalized and original facet string in sorter
    for eob in merged_strings_iter {
        key_buffer.truncate(TRUNCATE_SIZE);
+        let (side, normalized, original) = match eob {
+            EitherOrBoth::Both((normalized, del), (_, add)) => {
+                let merged_strings_iter =
+                    itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
+                        original_del.cmp(original_add)
+                    });
+
+                // FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
+                // but we possibly have multiple original values that changed in the case where the field is an
+                // array of multiple values that normalize to the same value.
+                // (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
+                //
+                // We'll work best effort by ignoring when the same value appears in both sides, deleting the first
+                // value that is only in the old version, and adding the first value that is only in the new version
+                let mut obkv = KvWriterDelAdd::memory();
+                let mut del = None;
+                let mut add = None;
+                let mut both = None;
+
+                for eob in merged_strings_iter {
                    match eob {
-            EitherOrBoth::Both(_, _) => (), // no need to touch anything
-            EitherOrBoth::Left((normalized, original)) => {
-                let truncated = truncate_string(normalized);
+                        EitherOrBoth::Both((_normalized, original), _) => {
+                            both = match both {
+                                Some(both) => Some(both),
+                                None => Some(original),
+                            }
+                        }
+                        EitherOrBoth::Left((_normalized, original)) => {
+                            del = match del {
+                                Some(del) => Some(del),
+                                None => Some(original),
+                            };
+                        }
+                        EitherOrBoth::Right((_normalized, original)) => {
+                            add = match add {
+                                Some(add) => Some(add),
+                                None => Some(original),
+                            }
+                        }
+                    }
+                }
+
+                if let Some(del) = del {
+                    obkv.insert(DelAdd::Deletion, del)?;
+                }
+                if let Some(add) = add
+                    // prefer the newly added, but if there is none, keep a value in the list of values
+                    // since the normalized value appears both in old and new, we should never remove it.
+                    .or(both)
+                {
+                    obkv.insert(DelAdd::Addition, add)?;
+                }
+
+                let truncated = truncate_str(normalized);
+                key_buffer.extend_from_slice(truncated.as_bytes());
+
+                let bytes = obkv.into_inner()?;
+                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
+                continue;
+            }
+            EitherOrBoth::Left((_normalized, mut original)) => {
+                // FIXME: we only consider the first value for the purpose of facet search
+                // another structure is needed, able to retain all originals associated with a normalized value.
+                let Some((normalized, original)) = original.next() else {
+                    continue;
+                };
+                (DelAdd::Deletion, normalized, original)
+            }
+            EitherOrBoth::Right((_normalized, mut original)) => {
+                // FIXME: we only consider the first value for the purpose of facet search
+                // another structure is needed, able to retain all originals associated with a normalized value.
+                let Some((normalized, original)) = original.next() else {
+                    continue;
+                };
+                (DelAdd::Addition, normalized, original)
+            }
+        };
+        let truncated = truncate_str(normalized);
        key_buffer.extend_from_slice(truncated.as_bytes());

        let mut obkv = KvWriterDelAdd::memory();
-                obkv.insert(DelAdd::Deletion, original)?;
+        obkv.insert(side, original)?;
        let bytes = obkv.into_inner()?;
        fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
    }
-            EitherOrBoth::Right((normalized, original)) => {
-                let truncated = truncate_string(normalized);
-                key_buffer.extend_from_slice(truncated.as_bytes());
-
-                let mut obkv = KvWriterDelAdd::memory();
-                obkv.insert(DelAdd::Addition, original)?;
-                let bytes = obkv.into_inner()?;
-                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
-            }
-        }
-    }

    Ok(())
 }
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -290,7 +290,7 @@ where

                match result? {
                    DocumentEdition::Deleted(docid) => {
-                        documents_to_remove.push(docid);
+                        documents_to_remove.insert(docid);
                    }
                    DocumentEdition::Edited(new_document) => {
                        documents_batch_builder.append_json_object(&new_document)?;
--- a/milli/src/vector/error.rs
+++ b/milli/src/vector/error.rs
@ -62,8 +62,18 @@ pub enum EmbedErrorKind {
    RestResponseDeserialization(std::io::Error),
    #[error("expected a response containing {0} embeddings, got only {1}")]
    RestResponseEmbeddingCount(usize, usize),
-    #[error("could not authenticate against embedding server{}", option_info(.0.as_deref(), "server replied with "))]
-    RestUnauthorized(Option<String>),
+    #[error("could not authenticate against {embedding} server{server_reply}{hint}", embedding=match *.1 {
+        ConfigurationSource::User => "embedding",
+        ConfigurationSource::OpenAi => "OpenAI",
+        ConfigurationSource::Ollama => "ollama"
+    },
+    server_reply=option_info(.0.as_deref(), "server replied with "),
+    hint=match *.1 {
+        ConfigurationSource::User => "\n  - Hint: Check the `apiKey` parameter in the embedder configuration",
+        ConfigurationSource::OpenAi => "\n  - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables",
+        ConfigurationSource::Ollama => "\n  - Hint: Check the `apiKey` parameter in the embedder configuration"
+    })]
+    RestUnauthorized(Option<String>, ConfigurationSource),
    #[error("sent too many requests to embedding server{}", option_info(.0.as_deref(), "server replied with "))]
    RestTooManyRequests(Option<String>),
    #[error("sent a bad request to embedding server{}{}",
@ -136,8 +146,14 @@ impl EmbedError {
        }
    }

-    pub(crate) fn rest_unauthorized(error_response: Option<String>) -> EmbedError {
-        Self { kind: EmbedErrorKind::RestUnauthorized(error_response), fault: FaultSource::User }
+    pub(crate) fn rest_unauthorized(
+        error_response: Option<String>,
+        configuration_source: ConfigurationSource, // selects the server name and hint in the message
+    ) -> EmbedError {
+        Self {
+            kind: EmbedErrorKind::RestUnauthorized(error_response, configuration_source),
+            fault: FaultSource::User, // authentication failures are reported as user faults
+        }
     }

    pub(crate) fn rest_too_many_requests(error_response: Option<String>) -> EmbedError {
--- a/milli/src/vector/openai.rs
+++ b/milli/src/vector/openai.rs
@ -183,7 +183,7 @@ impl Embedder {

        let rest_embedder = RestEmbedder::new(
            RestEmbedderOptions {
-                api_key: Some(api_key.clone()),
+                api_key: (!api_key.is_empty()).then(|| api_key.clone()),
                distribution: None,
                dimensions: Some(options.dimensions()),
                url,
--- a/milli/src/vector/rest.rs
+++ b/milli/src/vector/rest.rs
@ -275,7 +275,10 @@ fn check_response(
        Err(ureq::Error::Status(code, response)) => {
            let error_response: Option<String> = response.into_string().ok();
            Err(match code {
-                401 => Retry::give_up(EmbedError::rest_unauthorized(error_response)),
+                401 => Retry::give_up(EmbedError::rest_unauthorized(
+                    error_response,
+                    configuration_source,
+                )),
                429 => Retry::rate_limited(EmbedError::rest_too_many_requests(error_response)),
                400 => Retry::give_up(EmbedError::rest_bad_request(
                    error_response,