Mirror of https://github.com/meilisearch/meilisearch.git (synced 2024-11-22 18:17:39 +08:00)

Merge #4080

4080: Bring back changes from v1.4.0 into main r=Kerollmops a=curquiza

Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: curquiza <curquiza@users.noreply.github.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: curquiza <clementine@meilisearch.com>
Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
Co-authored-by: dogukanakkaya <doguakkaya27@hotmail.com>

This commit is contained in commit 86b314626d.
.github/workflows/publish-apt-brew-pkg.yml (vendored, 1 line changed)

@@ -53,5 +53,6 @@ jobs:
         uses: mislav/bump-homebrew-formula-action@v2
         with:
           formula-name: meilisearch
+          formula-path: Formula/m/meilisearch.rb
         env:
           COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}
Cargo.lock (generated, 10 lines changed)

@@ -1444,6 +1444,7 @@ dependencies = [
  "insta",
  "nom",
  "nom_locate",
+ "unescaper",
 ]
 
 [[package]]
@@ -4180,6 +4181,15 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"
+
+[[package]]
+name = "unescaper"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8"
+dependencies = [
+ "thiserror",
+]
 
 [[package]]
 name = "unicase"
 version = "2.6.0"
@@ -14,6 +14,7 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
+unescaper = "0.1.2"
 
 [dev-dependencies]
 insta = "1.29.0"
@@ -62,6 +62,7 @@ pub enum ErrorKind<'a> {
     MisusedGeoRadius,
     MisusedGeoBoundingBox,
     InvalidPrimary,
+    InvalidEscapedNumber,
     ExpectedEof,
     ExpectedValue(ExpectedValueKind),
     MalformedValue,
@@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> {
             let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
             writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
         }
+        ErrorKind::InvalidEscapedNumber => {
+            writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
+        }
         ErrorKind::ExpectedEof => {
             writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
         }
@@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> {
 
 #[cfg(test)]
 pub mod tests {
+    use FilterCondition as Fc;
+
     use super::*;
 
     /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
@@ -556,14 +558,22 @@ pub mod tests {
         unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
     }
 
+    fn p(s: &str) -> impl std::fmt::Display + '_ {
+        Fc::parse(s).unwrap().unwrap()
+    }
+
+    #[test]
+    fn parse_escaped() {
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
+        // but it also works with other sequencies
+        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
+    }
+
     #[test]
     fn parse() {
-        use FilterCondition as Fc;
-
-        fn p(s: &str) -> impl std::fmt::Display + '_ {
-            Fc::parse(s).unwrap().unwrap()
-        }
-
         // Test equal
         insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
         insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");
@@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult<Token> {
         })
     })?;
 
-    Ok((input, value))
+    match unescaper::unescape(value.value()) {
+        Ok(content) => {
+            if content.len() != value.value().len() {
+                Ok((input, Token::new(value.original_span(), Some(content))))
+            } else {
+                Ok((input, value))
+            }
+        }
+        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
+        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::InvalidEscapedNumber,
+        ))),
+        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::MalformedValue,
+        ))),
+    }
 }
 
 fn is_value_component(c: char) -> bool {
@@ -318,17 +335,17 @@ pub mod test {
             ("\"cha'nnel\"", "cha'nnel", false),
             ("I'm tamo", "I", false),
             // escaped thing but not quote
-            (r#""\\""#, r#"\\"#, false),
-            (r#""\\\\\\""#, r#"\\\\\\"#, false),
-            (r#""aa\\aa""#, r#"aa\\aa"#, false),
+            (r#""\\""#, r#"\"#, true),
+            (r#""\\\\\\""#, r#"\\\"#, true),
+            (r#""aa\\aa""#, r#"aa\aa"#, true),
             // with double quote
             (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-            (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
+            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
             (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
             (r#""\"\"""#, r#""""#, true),
             // with simple quote
             (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
+            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
             (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
             (r#"'\'\''"#, r#"''"#, true),
         ];
@@ -350,7 +367,14 @@ pub mod test {
                 "Filter `{}` was not supposed to be escaped",
                 input
             );
-            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
+            assert_eq!(
+                token.value(),
+                expected,
+                "Filter `{}` failed by giving `{}` instead of `{}`.",
+                input,
+                token.value(),
+                expected
+            );
         }
     }
 
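A minimal sketch (not part of the diff) of the length-comparison trick used in the parse_value hunk above, assuming only the unescaper 0.1.2 API the hunk itself relies on, unescaper::unescape(&str) -> Result<String, unescaper::Error>, as a dependency: when nothing was unescaped the output has the same length as the input, so the original token can be kept; otherwise a new token is built from the unescaped content. The unescape_or_keep helper below is hypothetical, not a Meilisearch function.

// Sketch only: mirrors the length comparison of the hunk above.
fn unescape_or_keep(raw: &str) -> String {
    match unescaper::unescape(raw) {
        // e.g. `\\` -> `\`, `\x20` -> space, `\'` -> `'`
        Ok(content) if content.len() != raw.len() => content, // something was unescaped
        Ok(_) => raw.to_string(),                             // nothing changed, keep the input
        Err(_) => raw.to_string(),                            // the real code maps these to parser errors
    }
}

fn main() {
    assert_eq!(unescape_or_keep(r"foo\\bar"), r"foo\bar");
    assert_eq!(unescape_or_keep(r"foo\x20bar"), "foo bar");
    assert_eq!(unescape_or_keep("plain"), "plain");
}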
@@ -67,10 +67,6 @@ pub(crate) enum Batch {
         op: IndexOperation,
         must_create_index: bool,
     },
-    IndexDocumentDeletionByFilter {
-        index_uid: String,
-        task: Task,
-    },
     IndexCreation {
         index_uid: String,
         primary_key: Option<String>,
@@ -114,6 +110,10 @@ pub(crate) enum IndexOperation {
         documents: Vec<Vec<String>>,
         tasks: Vec<Task>,
     },
+    IndexDocumentDeletionByFilter {
+        index_uid: String,
+        task: Task,
+    },
     DocumentClear {
         index_uid: String,
         tasks: Vec<Task>,
@@ -155,7 +155,6 @@ impl Batch {
             | Batch::TaskDeletion(task)
             | Batch::Dump(task)
             | Batch::IndexCreation { task, .. }
-            | Batch::IndexDocumentDeletionByFilter { task, .. }
             | Batch::IndexUpdate { task, .. } => vec![task.uid],
             Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
                 tasks.iter().map(|task| task.uid).collect()
@@ -167,6 +166,7 @@ impl Batch {
                 | IndexOperation::DocumentClear { tasks, .. } => {
                     tasks.iter().map(|task| task.uid).collect()
                 }
+                IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
                 IndexOperation::SettingsAndDocumentOperation {
                     document_import_tasks: tasks,
                     settings_tasks: other,
@@ -194,8 +194,7 @@ impl Batch {
             IndexOperation { op, .. } => Some(op.index_uid()),
             IndexCreation { index_uid, .. }
             | IndexUpdate { index_uid, .. }
-            | IndexDeletion { index_uid, .. }
-            | IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
+            | IndexDeletion { index_uid, .. } => Some(index_uid),
         }
     }
 }
@@ -205,6 +204,7 @@ impl IndexOperation {
         match self {
             IndexOperation::DocumentOperation { index_uid, .. }
             | IndexOperation::DocumentDeletion { index_uid, .. }
+            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
             | IndexOperation::DocumentClear { index_uid, .. }
             | IndexOperation::Settings { index_uid, .. }
             | IndexOperation::DocumentClearAndSetting { index_uid, .. }
@@ -239,9 +239,12 @@ impl IndexScheduler {
                 let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
                 match &task.kind {
                     KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
-                        Ok(Some(Batch::IndexDocumentDeletionByFilter {
-                            index_uid: index_uid.clone(),
-                            task,
+                        Ok(Some(Batch::IndexOperation {
+                            op: IndexOperation::IndexDocumentDeletionByFilter {
+                                index_uid: index_uid.clone(),
+                                task,
+                            },
+                            must_create_index: false,
                         }))
                     }
                     _ => unreachable!(),
@@ -896,51 +899,6 @@ impl IndexScheduler {
 
                 Ok(tasks)
             }
-            Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
-                let (index_uid, filter) =
-                    if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
-                        &task.kind
-                    {
-                        (index_uid, filter_expr)
-                    } else {
-                        unreachable!()
-                    };
-                let index = {
-                    let rtxn = self.env.read_txn()?;
-                    self.index_mapper.index(&rtxn, index_uid)?
-                };
-                let deleted_documents = delete_document_by_filter(filter, index);
-                let original_filter = if let Some(Details::DocumentDeletionByFilter {
-                    original_filter,
-                    deleted_documents: _,
-                }) = task.details
-                {
-                    original_filter
-                } else {
-                    // In the case of a `documentDeleteByFilter` the details MUST be set
-                    unreachable!();
-                };
-
-                match deleted_documents {
-                    Ok(deleted_documents) => {
-                        task.status = Status::Succeeded;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(deleted_documents),
-                        });
-                    }
-                    Err(e) => {
-                        task.status = Status::Failed;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(0),
-                        });
-                        task.error = Some(e.into());
-                    }
-                }
-
-                Ok(vec![task])
-            }
             Batch::IndexCreation { index_uid, primary_key, task } => {
                 let wtxn = self.env.write_txn()?;
                 if self.index_mapper.exists(&wtxn, &index_uid)? {
@@ -1299,6 +1257,47 @@ impl IndexScheduler {
 
                 Ok(tasks)
             }
+            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
+                let filter =
+                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
+                        &task.kind
+                    {
+                        filter_expr
+                    } else {
+                        unreachable!()
+                    };
+                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+                let original_filter = if let Some(Details::DocumentDeletionByFilter {
+                    original_filter,
+                    deleted_documents: _,
+                }) = task.details
+                {
+                    original_filter
+                } else {
+                    // In the case of a `documentDeleteByFilter` the details MUST be set
+                    unreachable!();
+                };
+
+                match deleted_documents {
+                    Ok(deleted_documents) => {
+                        task.status = Status::Succeeded;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(deleted_documents),
+                        });
+                    }
+                    Err(e) => {
+                        task.status = Status::Failed;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(0),
+                        });
+                        task.error = Some(e.into());
+                    }
+                }
+
+                Ok(vec![task])
+            }
             IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
                 let indexer_config = self.index_mapper.indexer_config();
                 let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
@@ -1498,23 +1497,22 @@ impl IndexScheduler {
     }
 }
 
-fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
+fn delete_document_by_filter<'a>(
+    wtxn: &mut RwTxn<'a, '_>,
+    filter: &serde_json::Value,
+    index: &'a Index,
+) -> Result<u64> {
     let filter = Filter::from_json(filter)?;
     Ok(if let Some(filter) = filter {
-        let mut wtxn = index.write_txn()?;
-
-        let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
+        let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
             milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                 Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
             }
             e => e.into(),
         })?;
-        let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
+        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
         delete_operation.delete_documents(&candidates);
-        let deleted_documents =
-            delete_operation.execute().map(|result| result.deleted_documents)?;
-        wtxn.commit()?;
-        deleted_documents
+        delete_operation.execute().map(|result| result.deleted_documents)?
     } else {
        0
    })
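The signature change above moves transaction ownership to the caller: the deletion now runs inside the batch's own write transaction instead of opening and committing one itself. A toy sketch of that pattern, with hypothetical Txn and Index stand-ins rather than the real heed/milli types:

// Sketch only: illustrates "the caller owns the transaction, the helper borrows it".
struct Txn { committed: bool }
struct Index;

fn delete_by_filter(_wtxn: &mut Txn, _index: &Index) -> u64 {
    // evaluate the filter and delete inside the borrowed transaction
    2
}

fn process_batch(index: &Index) -> u64 {
    let mut wtxn = Txn { committed: false }; // opened once by the caller
    let deleted = delete_by_filter(&mut wtxn, index);
    // other operations of the same batch can share this transaction here
    wtxn.committed = true;                   // committed once, for the whole batch
    deleted
}

fn main() {
    assert_eq!(process_batch(&Index), 2);
}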
@@ -60,8 +60,7 @@ pub async fn swap_indexes(
     }
 
     let task = KindWithContent::IndexSwap { swaps };
-    let task = index_scheduler.register(task)?;
-    let task: SummarizedTaskView = task.into();
+    let task: SummarizedTaskView =
+        tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
     Ok(HttpResponse::Accepted().json(task))
 }
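The route above now pushes the synchronous register call onto tokio's blocking thread pool so it cannot stall the async executor. A minimal sketch of that pattern (register_blocking is a hypothetical helper; assumes the tokio runtime with the rt and macros features):

// Sketch only: the closure stands in for the blocking scheduler call.
async fn register_blocking() -> Result<u64, Box<dyn std::error::Error + Send + Sync>> {
    let task_uid = tokio::task::spawn_blocking(move || {
        // blocking work runs on the dedicated blocking pool
        Ok::<u64, std::io::Error>(42)
    })
    .await??; // first ? unwraps the JoinError, second ? the closure's own error
    Ok(task_uid)
}

#[tokio::main]
async fn main() {
    assert_eq!(register_blocking().await.unwrap(), 42);
}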
@@ -154,6 +154,19 @@ async fn delete_document_by_filter() {
         )
         .await;
     index.wait_task(1).await;
+
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 4,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 3,
+        "id": 4
+      }
+    }
+    "###);
+
     let (response, code) =
         index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
     snapshot!(code, @"202 Accepted");
@@ -188,6 +201,18 @@ async fn delete_document_by_filter() {
     }
     "###);
+
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 2,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 2
+      }
+    }
+    "###);
 
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"
@@ -241,6 +266,18 @@ async fn delete_document_by_filter() {
     }
     "###);
+
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 1,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 1
+      }
+    }
+    "###);
 
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"
@@ -1104,3 +1104,59 @@ async fn camelcased_words() {
         })
         .await;
 }
+
+#[actix_rt::test]
+async fn simple_search_with_strange_synonyms() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await;
+    let r = index.wait_task(0).await;
+    meili_snap::snapshot!(r["status"], @r###""succeeded""###);
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "How to train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "How & train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "to"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+}
@@ -418,19 +418,11 @@ impl<'t> Matcher<'t, '_> {
         } else {
             match &self.matches {
                 Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // compute the best interval to crop around.
-                    let matches = match format_options.crop {
-                        Some(crop_size) if crop_size > 0 => {
-                            self.find_best_match_interval(matches, crop_size)
-                        }
-                        _ => matches,
-                    };
-
                     // If the text has to be cropped,
                     // crop around the best interval.
                     let (byte_start, byte_end) = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
+                            let matches = self.find_best_match_interval(matches, crop_size);
                             self.crop_bounds(tokens, matches, crop_size)
                         }
                         _ => (0, self.text.len()),
@@ -450,6 +442,11 @@ impl<'t> Matcher<'t, '_> {
                     for m in matches {
                         let token = &tokens[m.token_position];
 
+                        // skip matches out of the crop window.
+                        if token.byte_start < byte_start || token.byte_end > byte_end {
+                            continue;
+                        }
+
                         if byte_index < token.byte_start {
                             formatted.push(&self.text[byte_index..token.byte_start]);
                         }
@@ -800,6 +797,37 @@ mod tests {
         );
     }
 
+    #[test]
+    fn format_highlight_crop_phrase_query() {
+        //! testing: https://github.com/meilisearch/meilisearch/issues/3975
+        let temp_index = TempIndex::new();
+        temp_index
+            .add_documents(documents!([
+                { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
+            ]))
+            .unwrap();
+        let rtxn = temp_index.read_txn().unwrap();
+
+        let format_options = FormatOptions { highlight: true, crop: Some(10) };
+        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        let mut matcher = builder.build(text);
+        // should return 10 words with a marker at the start as well the end, and the highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…had the power to split <em>the</em> <em>world</em> between those who…"
+        );
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
+        let mut matcher = builder.build(text);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
+        );
+    }
+
     #[test]
     fn smaller_crop_size() {
         //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
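The guard added in the crop hunks above keeps highlight markers inside the crop window: a match whose token starts before the window or ends after it is skipped entirely. A tiny sketch of that bounds check, with plain (start, end) byte offsets standing in for the real Token type:

// Sketch only: same predicate as the `continue` guard above.
fn in_crop_window(window: (usize, usize), token: (usize, usize)) -> bool {
    let (byte_start, byte_end) = window;
    !(token.0 < byte_start || token.1 > byte_end)
}

fn main() {
    let window = (10, 40);
    assert!(in_crop_window(window, (12, 18)));  // fully inside: formatted
    assert!(!in_crop_window(window, (5, 12)));  // starts before the window: skipped
    assert!(!in_crop_window(window, (38, 45))); // ends after the window: skipped
}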
@@ -226,9 +226,9 @@ fn process_tokens<'a>(
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
         .skip_while(|token| token.is_separator())
-        .scan((0, None), |(offset, prev_kind), token| {
+        .scan((0, None), |(offset, prev_kind), mut token| {
             match token.kind {
-                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                     *offset += match *prev_kind {
                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                         Some(_) => 1,
@@ -244,7 +244,7 @@ fn process_tokens<'a>(
                 {
                     *prev_kind = Some(token.kind);
                 }
-                _ => (),
+                _ => token.kind = TokenKind::Unknown,
             }
             Some((*offset, token))
         })
@@ -59,7 +59,13 @@ pub(crate) fn data_from_obkv_documents(
         original_obkv_chunks
             .par_bridge()
             .map(|original_documents_chunk| {
-                send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
+                send_original_documents_data(
+                    original_documents_chunk,
+                    indexer,
+                    lmdb_writer_sx.clone(),
+                    vectors_field_id,
+                    primary_key_id,
+                )
             })
             .collect::<Result<()>>()?;
 
@@ -76,7 +82,6 @@ pub(crate) fn data_from_obkv_documents(
                     &faceted_fields,
                     primary_key_id,
                     geo_fields_ids,
-                    vectors_field_id,
                     &stop_words,
                     &allowed_separators,
                     &dictionary,
@@ -265,11 +270,33 @@ fn spawn_extraction_task<FE, FS, M>(
 /// - documents
 fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<File>>,
+    indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
+    vectors_field_id: Option<FieldId>,
+    primary_key_id: FieldId,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
 
+    if let Some(vectors_field_id) = vectors_field_id {
+        let documents_chunk_cloned = original_documents_chunk.clone();
+        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
+            let _ = match result {
+                Ok(vector_points) => {
+                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
+                }
+                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
+            };
+        });
+    }
+
     // TODO: create a custom internal error
     lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
     Ok(())
@@ -291,7 +318,6 @@ fn send_and_extract_flattened_documents_data(
     faceted_fields: &HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    vectors_field_id: Option<FieldId>,
     stop_words: &Option<fst::Set<&[u8]>>,
     allowed_separators: &Option<&[&str]>,
     dictionary: &Option<&[&str]>,
@@ -322,25 +348,6 @@ fn send_and_extract_flattened_documents_data(
         });
     }
 
-    if let Some(vectors_field_id) = vectors_field_id {
-        let documents_chunk_cloned = flattened_documents_chunk.clone();
-        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
-            let _ = match result {
-                Ok(vector_points) => {
-                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
-                }
-                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
-            };
-        });
-    }
-
     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
@@ -2550,6 +2550,25 @@ mod tests {
         db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
     }
 
+    /// Index multiple different number of vectors in documents.
+    /// Vectors must be of the same length.
+    #[test]
+    fn test_multiple_vectors() {
+        let index = TempIndex::new();
+
+        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
+        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
+        index
+            .add_documents(
+                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
+            )
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
+        assert_eq!(res.documents_ids.len(), 3);
+    }
+
     #[test]
     fn reproduce_the_bug() {
         /*
@@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             tokenizer
                 .tokenize(text)
                 .filter_map(|token| {
-                    if token.is_word() {
+                    if token.is_word() && !token.lemma().is_empty() {
                         Some(token.lemma().to_string())
                     } else {
                         None
@@ -608,13 +608,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             for (word, synonyms) in user_synonyms {
                 // Normalize both the word and associated synonyms.
                 let normalized_word = normalize(&tokenizer, word);
-                let normalized_synonyms =
-                    synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
+                let normalized_synonyms: Vec<_> = synonyms
+                    .iter()
+                    .map(|synonym| normalize(&tokenizer, synonym))
+                    .filter(|synonym| !synonym.is_empty())
+                    .collect();
 
                 // Store the normalized synonyms under the normalized word,
                 // merging the possible duplicate words.
-                let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
-                entry.extend(normalized_synonyms);
+                if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
+                    let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
+                    entry.extend(normalized_synonyms.into_iter());
+                }
             }
 
             // Make sure that we don't have duplicate synonyms.
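The synonym hunk above drops synonyms that normalize to an empty string and only creates an entry when both the word and at least one synonym survive normalization. A small self-contained sketch of that filtering, with a hypothetical trim-and-lowercase normalize standing in for the tokenizer-based one used by milli:

// Sketch only: `normalize` is a stand-in, not the milli implementation.
use std::collections::BTreeMap;

fn normalize(s: &str) -> String {
    s.trim().to_lowercase()
}

fn main() {
    let user_synonyms =
        BTreeMap::from([("Japanese".to_string(), vec!["  ".to_string(), "Nihon".to_string()])]);
    let mut new_synonyms: BTreeMap<String, Vec<String>> = BTreeMap::new();

    for (word, synonyms) in user_synonyms {
        let normalized_word = normalize(&word);
        let normalized_synonyms: Vec<_> = synonyms
            .iter()
            .map(|synonym| normalize(synonym))
            .filter(|synonym| !synonym.is_empty()) // "  " normalizes to "" and is dropped
            .collect();
        if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
            new_synonyms.entry(normalized_word).or_insert_with(Vec::new).extend(normalized_synonyms);
        }
    }

    assert_eq!(new_synonyms["japanese"], vec!["nihon".to_string()]);
}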
@@ -1422,6 +1427,43 @@ mod tests {
         assert!(result.documents_ids.is_empty());
     }
 
+    #[test]
+    fn thai_synonyms() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.autogenerate_docids = true;
+
+        let mut wtxn = index.write_txn().unwrap();
+        // Send 3 documents with ids from 1 to 3.
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "name": "ยี่ปุ่น" },
+                    { "name": "ญี่ปุ่น" },
+                ]),
+            )
+            .unwrap();
+
+        // In the same transaction provide some synonyms
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_synonyms(btreemap! {
+                    "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
+                });
+            })
+            .unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use synonyms
+        let result = index.search(&rtxn).query("japanese").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+    }
+
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let index = TempIndex::new();
@@ -186,12 +186,16 @@ fn create_value(value: &Document, mut selectors: HashSet<&str>) -> Document {
                     let array = create_array(array, &sub_selectors);
                     if !array.is_empty() {
                         new_value.insert(key.to_string(), array.into());
+                    } else {
+                        new_value.insert(key.to_string(), Value::Array(vec![]));
                     }
                 }
                 Value::Object(object) => {
                     let object = create_value(object, sub_selectors);
                     if !object.is_empty() {
                         new_value.insert(key.to_string(), object.into());
+                    } else {
+                        new_value.insert(key.to_string(), Value::Object(Map::new()));
                     }
                 }
                 _ => (),
@@ -211,6 +215,8 @@ fn create_array(array: &[Value], selectors: &HashSet<&str>) -> Vec<Value> {
                 let array = create_array(array, selectors);
                 if !array.is_empty() {
                     res.push(array.into());
+                } else {
+                    res.push(Value::Array(vec![]));
                 }
             }
             Value::Object(object) => {
@@ -637,6 +643,24 @@ mod tests {
         );
     }
 
+    #[test]
+    fn empty_array_object_return_empty() {
+        let value: Value = json!({
+            "array": [],
+            "object": {},
+        });
+        let value: &Document = value.as_object().unwrap();
+
+        let res: Value = select_values(value, vec!["array.name", "object.name"]).into();
+        assert_eq!(
+            res,
+            json!({
+                "array": [],
+                "object": {},
+            })
+        );
+    }
+
     #[test]
     fn all_conflict_variation() {
         let value: Value = json!({