Merge 510ca99996 into c1d8ee2a8d

Fixes #4974
Add a test reproducing the bug
2024-11-22 18:17:39 +08:00 · 2024-11-18 18:47:07 +01:00 · 2024-11-18 16:08:55 +01:00 · 2024-11-18 16:08:55 +01:00 · 2024-11-18 16:08:53 +01:00 · 2024-11-18 11:46:00 +01:00
6 changed files with 180 additions and 7 deletions
--- a/crates/meilisearch/tests/search/formatted.rs
+++ b/crates/meilisearch/tests/search/formatted.rs
@ -4,6 +4,58 @@ use super::*;
 use crate::common::Server;
 use crate::json;
 #[actix_rt::test]
 async fn search_formatted_from_sdk() {
    // Reproduces a failure first seen in an SDK test: a filtered search that
    // crops and highlights the same attribute (`title`).
    let server = Server::new_shared();
    let index = server.unique_index();

    // `genre` has to be filterable for the `filter` used by the search below.
    let settings = json!({ "filterableAttributes": ["genre"], "searchableAttributes": ["title"] });
    index.update_settings(settings).await;

    // The last document deliberately has no `genre` field.
    let books = json!([
      { "id": 123,  "title": "Pride and Prejudice",                     "genre": "romance" },
      { "id": 456,  "title": "Le Petit Prince",                         "genre": "adventure" },
      { "id": 1,    "title": "Alice In Wonderland",                     "genre": "adventure" },
      { "id": 2,    "title": "Le Rouge et le Noir",                     "genre": "romance" },
      { "id": 1344, "title": "The Hobbit",                              "genre": "adventure" },
      { "id": 4,    "title": "Harry Potter and the Half-Blood Prince",  "genre": "fantasy" },
      { "id": 7,    "title": "Harry Potter and the Chamber of Secrets", "genre": "fantasy" },
      { "id": 42,   "title": "The Hitchhiker's Guide to the Galaxy" }
    ]);
    let (task, _) = index.add_documents(books, None).await;
    index.wait_task(task.uid()).await;

    // Crop to two words around the match and highlight it, keeping only `title`.
    let query = json!({ "q":"prince",
      "attributesToCrop": ["title"],
      "cropLength": 2,
      "filter": "genre = adventure",
      "attributesToHighlight": ["title"],
      "attributesToRetrieve": ["title"]
    });
    index
        .search(query, |response, code| {
            assert_eq!(code, 200, "{}", response);
            // Only the first hit is pinned: the cropped and highlighted title.
            allow_duplicates! {
              assert_json_snapshot!(response["hits"][0],
                    { "._rankingScore" => "[score]" },
                    @r###"
                  {
                    "title": "Le Petit Prince",
                    "_formatted": {
                      "title": "…Petit <em>Prince</em>"
                    }
                  }
                  "###);
            }
        })
        .await;
 }
 #[actix_rt::test]
 async fn formatted_contain_wildcard() {
    let server = Server::new_shared();
--- a/crates/meilisearch/tests/search/mod.rs
+++ b/crates/meilisearch/tests/search/mod.rs
@ -15,6 +15,7 @@ mod pagination;
 mod restrict_searchable;
 mod search_queue;
 use meili_snap::{json_string, snapshot};
 use meilisearch::Opt;
 use tempfile::TempDir;
@ -62,6 +63,71 @@ async fn simple_search() {
        .await;
 }
 #[actix_rt::test]
 async fn search_with_stop_word() {
    // related to https://github.com/meilisearch/meilisearch/issues/4984
    //
    // Both query words ("to" and "the") are configured as stop words. The two
    // searches below differ only by the trailing space after the last word.
    let server = Server::new().await;
    let index = server.index("test");
    let (_, code) = index
        .update_settings(json!({"stopWords": ["the", "The", "a", "an", "to", "in", "of"]}))
        .await;
    meili_snap::snapshot!(code, @"202 Accepted");
    let documents = DOCUMENTS.clone();
    // Wait on the uid returned by the enqueue call rather than assuming the
    // document addition was registered as task 1.
    let (task, _) = index.add_documents(documents, None).await;
    index.wait_task(task.uid()).await;
    // prefix search (no trailing space after the last word): no hit is expected
    index
        .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| {
            assert_eq!(code, 200, "{}", response);
            snapshot!(json_string!(response["hits"]), @"[]");
        })
        .await;
    // non-prefix search (trailing space after the last word): the hits pinned
    // below come back with nothing highlighted in `_formatted`
    index
          .search(json!({"q": "to the ", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| {
              assert_eq!(code, 200, "{}", response);
              snapshot!(json_string!(response["hits"]), @r###"
              [
                {
                  "title": "Shazam!",
                  "_formatted": {
                    "title": "Shazam!"
                  }
                },
                {
                  "title": "Captain Marvel",
                  "_formatted": {
                    "title": "Captain Marvel"
                  }
                },
                {
                  "title": "Escape Room",
                  "_formatted": {
                    "title": "Escape Room"
                  }
                },
                {
                  "title": "How to Train Your Dragon: The Hidden World",
                  "_formatted": {
                    "title": "How to Train Your Dragon: The Hidden World"
                  }
                },
                {
                  "title": "Gläss",
                  "_formatted": {
                    "title": "Gläss"
                  }
                }
              ]
              "###);
          })
          .await;
 }
 #[actix_rt::test]
 async fn phrase_search_with_stop_word() {
    // related to https://github.com/meilisearch/meilisearch/issues/3521
--- a/crates/meilisearch/tests/search/restrict_searchable.rs
+++ b/crates/meilisearch/tests/search/restrict_searchable.rs
@ -367,3 +367,50 @@ async fn search_on_exact_field() {
        })
        .await;
 }
 #[actix_rt::test]
 async fn phrase_search_on_title() {
    let server = Server::new().await;

    // Every document has "Document Review" in `desc`, but ids 2 and 1 do not
    // have that exact phrase in `title`.
    let job_postings = json!([
      { "id": 8, "desc": "Document Review", "title": "Document Review Specialist II" },
      { "id": 5, "desc": "Document Review", "title": "Document Review Attorney" },
      { "id": 4, "desc": "Document Review", "title": "Document Review Manager - Cyber Incident Response (Remote)" },
      { "id": 3, "desc": "Document Review", "title": "Document Review Paralegal" },
      { "id": 2, "desc": "Document Review", "title": "Document Controller (Saudi National)" },
      { "id": 1, "desc": "Document Review", "title": "Document Reviewer" },
      { "id": 7, "desc": "Document Review", "title": "Document Review Specialist II" },
      { "id": 6, "desc": "Document Review", "title": "Document Review (Entry Level)" }
    ]);
    let index = index_with_documents(&server, &job_postings).await;

    // Quoted query, restricted to `title`: matches found only in `desc` must not be returned.
    let query = json!({"q": "\"Document Review\"", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["title"]});
    index
        .search(query, |response, code| {
            snapshot!(code, @"200 OK");
            snapshot!(json_string!(response["hits"]), @r###"
                [
                  {
                    "title": "Document Review Specialist II"
                  },
                  {
                    "title": "Document Review Attorney"
                  },
                  {
                    "title": "Document Review Manager - Cyber Incident Response (Remote)"
                  },
                  {
                    "title": "Document Review Paralegal"
                  },
                  {
                    "title": "Document Review Specialist II"
                  },
                  {
                    "title": "Document Review (Entry Level)"
                  }
                ]
                "###);
        })
        .await;
 }
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@ -268,7 +268,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                    last_match_last_token_position_plus_one
                } else {
                    // we have matched the end of possible tokens, there's nothing to advance
-                    tokens.len() - 1
+                    tokens.len()
                }
            };
--- a/crates/milli/src/search/new/resolve_query_graph.rs
+++ b/crates/milli/src/search/new/resolve_query_graph.rs
@ -193,15 +193,23 @@ pub fn compute_phrase_docids(
    if words.is_empty() {
        return Ok(RoaringBitmap::new());
    }
-    let mut candidates = RoaringBitmap::new();
+    let mut candidates = None;
    for word in words.iter().flatten().copied() {
        if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? {
-            candidates |= word_docids;
+            if let Some(candidates) = candidates.as_mut() {
                *candidates &= word_docids;
            } else {
                candidates = Some(word_docids);
            }
        } else {
            return Ok(RoaringBitmap::new());
        }
    }
    let Some(mut candidates) = candidates else {
        return Ok(RoaringBitmap::new());
    };
    let winsize = words.len().min(3);
    for win in words.windows(winsize) {
--- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -57,9 +57,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
        .map(|s| s.iter().map(String::as_str).collect());
    let old_dictionary: Option<Vec<_>> =
        settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-    let del_builder =
+    let mut del_builder =
        tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref());
-    let del_tokenizer = del_builder.into_tokenizer();
+    let del_tokenizer = del_builder.build();
    let new_stop_words = settings_diff.new.stop_words.as_ref();
    let new_separators: Option<Vec<_>> = settings_diff
@ -69,9 +69,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
        .map(|s| s.iter().map(String::as_str).collect());
    let new_dictionary: Option<Vec<_>> =
        settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
-    let add_builder =
+    let mut add_builder =
        tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref());
-    let add_tokenizer = add_builder.into_tokenizer();
+    let add_tokenizer = add_builder.build();
    // iterate over documents.
    let mut cursor = obkv_documents.into_cursor()?;
Author	SHA1	Message	Date
Many the fish	673b949b7a	Merge `510ca99996` into `c1d8ee2a8d`	2024-11-18 18:47:07 +01:00
ManyTheFish	510ca99996	Fixes #4974	2024-11-18 16:08:55 +01:00
ManyTheFish	8924d486db	Add a test reproducing the bug	2024-11-18 16:08:55 +01:00
ManyTheFish	e0c3f3d560	Fix #4984	2024-11-18 16:08:53 +01:00
ManyTheFish	cd796b0f4b	Fix SDK test	2024-11-18 11:46:00 +01:00
ManyTheFish	72ba353498	reproduce sdk fail	2024-11-18 10:03:23 +01:00
ManyTheFish	0dd321afc7	reproduce #4984	2024-11-14 10:02:51 +01:00