From 0dd321afc70330f730c8bc89fef9964fa479c2fb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 14 Nov 2024 10:02:51 +0100 Subject: [PATCH 1/6] reproduce #4984 --- crates/meilisearch/tests/search/mod.rs | 74 ++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index d1091d944..1dc406fb3 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -15,6 +15,7 @@ mod pagination; mod restrict_searchable; mod search_queue; +use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tempfile::TempDir; @@ -62,6 +63,79 @@ async fn simple_search() { .await; } +#[actix_rt::test] +async fn search_with_stop_word() { + // related to https://github.com/meilisearch/meilisearch/issues/4984 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = + index.update_settings(json!({"stopWords": ["the", "a", "an", "to", "in", "of"]})).await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // prefix search + index + .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "How to Train Your Dragon: The Hidden World", + "_formatted": { + "title": "How to Train Your Dragon: The Hidden World" + } + } + ] + "###); + }) + .await; + + // non-prefix search + index + .search(json!({"q": "to the ", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Shazam!", + "_formatted": { + "title": "Shazam!" + } + }, + { + "title": "Captain Marvel", + "_formatted": { + "title": "Captain Marvel" + } + }, + { + "title": "Escape Room", + "_formatted": { + "title": "Escape Room" + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "_formatted": { + "title": "How to Train Your Dragon: The Hidden World" + } + }, + { + "title": "Gläss", + "_formatted": { + "title": "Gläss" + } + } + ] + "###); + }) + .await; +} + #[actix_rt::test] async fn phrase_search_with_stop_word() { // related to https://github.com/meilisearch/meilisearch/issues/3521 From 72ba35349887e64f0ae69079c8b27de21d141ba8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 10:03:23 +0100 Subject: [PATCH 2/6] reproduce sdk fail --- crates/meilisearch/tests/search/formatted.rs | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs index 1484b6393..c549cd79d 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -4,6 +4,58 @@ use super::*; use crate::common::Server; use crate::json; +#[actix_rt::test] +async fn search_formatted_from_sdk() { + let server = Server::new_shared(); + let index = server.unique_index(); + + index + .update_settings( + json!({ "filterableAttributes": ["genre"], "searchableAttributes": ["title"] }), + ) + .await; + + let documents = json!([ + { "id": 123, "title": "Pride and Prejudice", "genre": "romance" }, + { "id": 456, "title": "Le Petit Prince", "genre": "adventure" }, + { "id": 1, "title": "Alice In Wonderland", "genre": "adventure" }, + { "id": 2, "title": "Le Rouge et le Noir", "genre": "romance" }, + { "id": 1344, "title": "The Hobbit", "genre": "adventure" }, + { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "genre": "fantasy" }, + { "id": 7, "title": "Harry Potter and the Chamber of Secrets", "genre": "fantasy" }, + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy" } + ]); + let (response, _) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + index + .search( + json!({ "q":"prince", + "attributesToCrop": ["title"], + "cropLength": 2, + "filter": "genre = adventure", + "attributesToHighlight": ["title"], + "attributesToRetrieve": ["title"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + allow_duplicates! { + assert_json_snapshot!(response["hits"][0], + { "._rankingScore" => "[score]" }, + @r###" + { + "title": "Le Petit Prince", + "_formatted": { + "title": "…Petit Prince" + } + } + "###); + } + }, + ) + .await; +} + #[actix_rt::test] async fn formatted_contain_wildcard() { let server = Server::new_shared(); From cd796b0f4b3323c74190cc862bcd95ab3abec318 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 11:46:00 +0100 Subject: [PATCH 3/6] Fix SDK test --- crates/milli/src/search/new/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 80e3ec7b2..c1fb18cfa 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -268,7 +268,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { last_match_last_token_position_plus_one } else { // we have matched the end of possible tokens, there's nothing to advance - tokens.len() - 1 + tokens.len() } }; From e0c3f3d560acd3d1cc67f09656c5594c47e1603f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 16:08:53 +0100 Subject: [PATCH 4/6] Fix #4984 --- crates/meilisearch/tests/search/mod.rs | 16 ++++------------ .../extract/extract_docid_word_positions.rs | 8 ++++---- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index 1dc406fb3..f3c11e451 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -69,8 +69,9 @@ async fn search_with_stop_word() { let server = Server::new().await; let index = server.index("test"); - let (_, code) = - index.update_settings(json!({"stopWords": ["the", "a", "an", "to", "in", "of"]})).await; + let (_, code) = index + .update_settings(json!({"stopWords": ["the", "The", "a", "an", "to", "in", "of"]})) + .await; meili_snap::snapshot!(code, @"202 Accepted"); let documents = DOCUMENTS.clone(); @@ -81,16 +82,7 @@ async fn search_with_stop_word() { index .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { assert_eq!(code, 200, "{}", response); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "How to Train Your Dragon: The Hidden World", - "_formatted": { - "title": "How to Train Your Dragon: The Hidden World" - } - } - ] - "###); + snapshot!(json_string!(response["hits"]), @"[]"); }) .await; diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ba11ceeb3..16ea92fa4 100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -57,9 +57,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let old_dictionary: Option> = settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let del_builder = + let mut del_builder = tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref()); - let del_tokenizer = del_builder.into_tokenizer(); + let del_tokenizer = del_builder.build(); let new_stop_words = settings_diff.new.stop_words.as_ref(); let new_separators: Option> = settings_diff @@ -69,9 +69,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let new_dictionary: Option> = settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let add_builder = + let mut add_builder = tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref()); - let add_tokenizer = add_builder.into_tokenizer(); + let add_tokenizer = add_builder.build(); // iterate over documents. let mut cursor = obkv_documents.into_cursor()?; From 8924d486dba1d48be642cd58830cbf7a2f46c515 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 3 Oct 2024 12:04:59 +0200 Subject: [PATCH 5/6] Add a test reproducing the bug --- .../tests/search/restrict_searchable.rs | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/crates/meilisearch/tests/search/restrict_searchable.rs b/crates/meilisearch/tests/search/restrict_searchable.rs index ca659c518..abd13fadf 100644 --- a/crates/meilisearch/tests/search/restrict_searchable.rs +++ b/crates/meilisearch/tests/search/restrict_searchable.rs @@ -367,3 +367,50 @@ async fn search_on_exact_field() { }) .await; } + +#[actix_rt::test] +async fn phrase_search_on_title() { + let server = Server::new().await; + let documents = json!([ + { "id": 8, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 5, "desc": "Document Review", "title": "Document Review Attorney" }, + { "id": 4, "desc": "Document Review", "title": "Document Review Manager - Cyber Incident Response (Remote)" }, + { "id": 3, "desc": "Document Review", "title": "Document Review Paralegal" }, + { "id": 2, "desc": "Document Review", "title": "Document Controller (Saudi National)" }, + { "id": 1, "desc": "Document Review", "title": "Document Reviewer" }, + { "id": 7, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 6, "desc": "Document Review", "title": "Document Review (Entry Level)" } + ]); + let index = index_with_documents(&server, &documents).await; + + index + .search( + json!({"q": "\"Document Review\"", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["title"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review Attorney" + }, + { + "title": "Document Review Manager - Cyber Incident Response (Remote)" + }, + { + "title": "Document Review Paralegal" + }, + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review (Entry Level)" + } + ] + "###); + }, + ) + .await; +} From 510ca999962e898c042c08682b5186ff3a3b1e71 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Nov 2024 12:28:03 +0100 Subject: [PATCH 6/6] Fixes #4974 --- crates/milli/src/search/new/resolve_query_graph.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/new/resolve_query_graph.rs b/crates/milli/src/search/new/resolve_query_graph.rs index 7a47b0a66..4496f8c65 100644 --- a/crates/milli/src/search/new/resolve_query_graph.rs +++ b/crates/milli/src/search/new/resolve_query_graph.rs @@ -193,15 +193,23 @@ pub fn compute_phrase_docids( if words.is_empty() { return Ok(RoaringBitmap::new()); } - let mut candidates = RoaringBitmap::new(); + let mut candidates = None; for word in words.iter().flatten().copied() { if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? { - candidates |= word_docids; + if let Some(candidates) = candidates.as_mut() { + *candidates &= word_docids; + } else { + candidates = Some(word_docids); + } } else { return Ok(RoaringBitmap::new()); } } + let Some(mut candidates) = candidates else { + return Ok(RoaringBitmap::new()); + }; + let winsize = words.len().min(3); for win in words.windows(winsize) {