diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs index ee33939fd..5ded39976 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -4,6 +4,58 @@ use super::*; use crate::common::Server; use crate::json; +#[actix_rt::test] +async fn search_formatted_from_sdk() { + let server = Server::new_shared(); + let index = server.unique_index(); + + index + .update_settings( + json!({ "filterableAttributes": ["genre"], "searchableAttributes": ["title"] }), + ) + .await; + + let documents = json!([ + { "id": 123, "title": "Pride and Prejudice", "genre": "romance" }, + { "id": 456, "title": "Le Petit Prince", "genre": "adventure" }, + { "id": 1, "title": "Alice In Wonderland", "genre": "adventure" }, + { "id": 2, "title": "Le Rouge et le Noir", "genre": "romance" }, + { "id": 1344, "title": "The Hobbit", "genre": "adventure" }, + { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "genre": "fantasy" }, + { "id": 7, "title": "Harry Potter and the Chamber of Secrets", "genre": "fantasy" }, + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy" } + ]); + let (response, _) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + index + .search( + json!({ "q":"prince", + "attributesToCrop": ["title"], + "cropLength": 2, + "filter": "genre = adventure", + "attributesToHighlight": ["title"], + "attributesToRetrieve": ["title"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + allow_duplicates! 
{ + assert_json_snapshot!(response["hits"][0], + { "._rankingScore" => "[score]" }, + @r###" + { + "title": "Le Petit Prince", + "_formatted": { + "title": "…Petit Prince" + } + } + "###); + } + }, + ) + .await; +} + #[actix_rt::test] async fn formatted_contain_wildcard() { let server = Server::new_shared(); diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index 8cafe1dd8..057b2b3a2 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -15,6 +15,7 @@ mod pagination; mod restrict_searchable; mod search_queue; +use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tempfile::TempDir; @@ -62,6 +63,71 @@ async fn simple_search() { .await; } +#[actix_rt::test] +async fn search_with_stop_word() { + // related to https://github.com/meilisearch/meilisearch/issues/4984 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index + .update_settings(json!({"stopWords": ["the", "The", "a", "an", "to", "in", "of"]})) + .await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // prefix search + index + .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @"[]"); + }) + .await; + + // non-prefix search + index + .search(json!({"q": "to the ", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Shazam!", + "_formatted": { + "title": "Shazam!" 
+ } + }, + { + "title": "Captain Marvel", + "_formatted": { + "title": "Captain Marvel" + } + }, + { + "title": "Escape Room", + "_formatted": { + "title": "Escape Room" + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "_formatted": { + "title": "How to Train Your Dragon: The Hidden World" + } + }, + { + "title": "Gläss", + "_formatted": { + "title": "Gläss" + } + } + ] + "###); + }) + .await; +} + #[actix_rt::test] async fn phrase_search_with_stop_word() { // related to https://github.com/meilisearch/meilisearch/issues/3521 diff --git a/crates/meilisearch/tests/search/restrict_searchable.rs b/crates/meilisearch/tests/search/restrict_searchable.rs index ca659c518..abd13fadf 100644 --- a/crates/meilisearch/tests/search/restrict_searchable.rs +++ b/crates/meilisearch/tests/search/restrict_searchable.rs @@ -367,3 +367,50 @@ async fn search_on_exact_field() { }) .await; } + +#[actix_rt::test] +async fn phrase_search_on_title() { + let server = Server::new().await; + let documents = json!([ + { "id": 8, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 5, "desc": "Document Review", "title": "Document Review Attorney" }, + { "id": 4, "desc": "Document Review", "title": "Document Review Manager - Cyber Incident Response (Remote)" }, + { "id": 3, "desc": "Document Review", "title": "Document Review Paralegal" }, + { "id": 2, "desc": "Document Review", "title": "Document Controller (Saudi National)" }, + { "id": 1, "desc": "Document Review", "title": "Document Reviewer" }, + { "id": 7, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 6, "desc": "Document Review", "title": "Document Review (Entry Level)" } + ]); + let index = index_with_documents(&server, &documents).await; + + index + .search( + json!({"q": "\"Document Review\"", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["title"]}), + |response, code| { + snapshot!(code, @"200 OK"); + 
snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review Attorney" + }, + { + "title": "Document Review Manager - Cyber Incident Response (Remote)" + }, + { + "title": "Document Review Paralegal" + }, + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review (Entry Level)" + } + ] + "###); + }, + ) + .await; +} diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index d7bc27c94..ba92c9c86 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -274,7 +274,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { last_match_last_token_position_plus_one } else { // we have matched the end of possible tokens, there's nothing to advance - tokens.len() - 1 + tokens.len() } }; diff --git a/crates/milli/src/search/new/resolve_query_graph.rs b/crates/milli/src/search/new/resolve_query_graph.rs index 7a47b0a66..4496f8c65 100644 --- a/crates/milli/src/search/new/resolve_query_graph.rs +++ b/crates/milli/src/search/new/resolve_query_graph.rs @@ -193,15 +193,23 @@ pub fn compute_phrase_docids( if words.is_empty() { return Ok(RoaringBitmap::new()); } - let mut candidates = RoaringBitmap::new(); + let mut candidates = None; for word in words.iter().flatten().copied() { if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? 
{ - candidates |= word_docids; + if let Some(candidates) = candidates.as_mut() { + *candidates &= word_docids; + } else { + candidates = Some(word_docids); + } } else { return Ok(RoaringBitmap::new()); } } + let Some(mut candidates) = candidates else { + return Ok(RoaringBitmap::new()); + }; + let winsize = words.len().min(3); for win in words.windows(winsize) { diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index b1e6f24be..606ae6b54 100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -58,9 +58,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let old_dictionary: Option<Vec<_>> = settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let del_builder = + let mut del_builder = tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref()); - let del_tokenizer = del_builder.into_tokenizer(); + let del_tokenizer = del_builder.build(); let new_stop_words = settings_diff.new.stop_words.as_ref(); let new_separators: Option<Vec<_>> = settings_diff @@ -70,9 +70,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let new_dictionary: Option<Vec<_>> = settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let add_builder = + let mut add_builder = tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref()); - let add_tokenizer = add_builder.into_tokenizer(); + let add_tokenizer = add_builder.build(); // iterate over documents. let mut cursor = obkv_documents.into_cursor()?;