diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 1a1c7721e..ab88cb671 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -491,6 +491,20 @@ pub fn perform_search( tokenizer_builder.allow_list(&script_lang_map); } + let separators = index.allowed_separators(&rtxn)?; + let separators: Option<Vec<&str>> = + separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); + if let Some(ref separators) = separators { + tokenizer_builder.separators(separators); + } + + let dictionary = index.dictionary(&rtxn)?; + let dictionary: Option<Vec<&str>> = + dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); + if let Some(ref dictionary) = dictionary { + tokenizer_builder.words_dict(dictionary); + } + let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build()); formatter_builder.crop_marker(query.crop_marker); formatter_builder.highlight_prefix(query.highlight_pre_tag); diff --git a/meilisearch/tests/settings/tokenizer_customization.rs b/meilisearch/tests/settings/tokenizer_customization.rs index e622d9a3c..75bea560b 100644 --- a/meilisearch/tests/settings/tokenizer_customization.rs +++ b/meilisearch/tests/settings/tokenizer_customization.rs @@ -52,3 +52,145 @@ async fn set_and_reset() { snapshot!(json_string!(response["separatorTokens"]), @"[]"); snapshot!(json_string!(response["dictionary"]), @"[]"); } + +#[actix_rt::test] +async fn set_and_search() { + let documents = json!([ + { + "id": 1, + "content": "Mac & cheese", + }, + { + "id": 2, + "content": "G#D#G#D#G#C#D#G#C#", + }, + { + "id": 3, + "content": "Mac&sep&&sepcheese", + }, + ]); + + let server = Server::new().await; + let index = server.index("test"); + + index.add_documents(documents, None).await; + index.wait_task(0).await; + + let (_response, _code) = index + .update_settings(json!({ + "nonSeparatorTokens": ["#", "&"], + "separatorTokens": ["<br/>
", "&sep"], + "dictionary": ["#", "A#", "B#", "C#", "D#", "E#", "F#", "G#"], + })) + .await; + index.wait_task(1).await; + + index + .search(json!({"q": "&", "attributesToHighlight": ["content"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "content": "Mac & cheese", + "_formatted": { + "id": "1", + "content": "Mac & cheese" + } + }, + { + "id": 3, + "content": "Mac&sep&&sepcheese", + "_formatted": { + "id": "3", + "content": "Mac&sep&&sepcheese" + } + } + ] + "###); + }) + .await; + + index + .search( + json!({"q": "Mac & cheese", "attributesToHighlight": ["content"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "content": "Mac & cheese", + "_formatted": { + "id": "1", + "content": "Mac & cheese" + } + }, + { + "id": 3, + "content": "Mac&sep&&sepcheese", + "_formatted": { + "id": "3", + "content": "Mac&sep&&sepcheese" + } + } + ] + "###); + }, + ) + .await; + + index + .search( + json!({"q": "Mac&sep&&sepcheese", "attributesToHighlight": ["content"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "content": "Mac & cheese", + "_formatted": { + "id": "1", + "content": "Mac & cheese" + } + }, + { + "id": 3, + "content": "Mac&sep&&sepcheese", + "_formatted": { + "id": "3", + "content": "Mac&sep&&sepcheese" + } + } + ] + "###); + }, + ) + .await; + + index + .search(json!({"q": "C#D#G", "attributesToHighlight": ["content"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "content": "G#D#G#D#G#C#D#G#C#", + "_formatted": { + "id": "2", + "content": "G#D#G#D#G#C#D#G#C#" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "#", "attributesToHighlight": ["content"]}), |response, code| { + snapshot!(code, @"200 OK"); + 
snapshot!(json_string!(response["hits"]), @"[]"); + }) + .await; +} diff --git a/milli/src/index.rs b/milli/src/index.rs index ea0120769..68014cc1a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1094,10 +1094,7 @@ impl Index { /* separators easing method */ - pub(crate) fn allowed_separators<'t>( - &self, - rtxn: &'t RoTxn, - ) -> Result<Option<Vec<String>>> { + pub fn allowed_separators<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<Vec<String>>> { let default_separators = charabia::separators::DEFAULT_SEPARATORS.iter().map(|s| s.to_string()); let mut separators: Option<Vec<String>> = None; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 034b279ad..8868d23fd 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -479,6 +479,20 @@ pub fn execute_search( tokbuilder.stop_words(stop_words); } + let separators = ctx.index.allowed_separators(ctx.txn)?; + let separators: Option<Vec<&str>> = + separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); + if let Some(ref separators) = separators { + tokbuilder.separators(separators); + } + + let dictionary = ctx.index.dictionary(ctx.txn)?; + let dictionary: Option<Vec<&str>> = + dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); + if let Some(ref dictionary) = dictionary { + tokbuilder.words_dict(dictionary); + } + let script_lang_map = ctx.index.script_language(ctx.txn)?; if !script_lang_map.is_empty() { tokbuilder.allow_list(&script_lang_map); diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index f726bf866..b56398385 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -28,6 +28,8 @@ pub fn extract_docid_word_positions( indexer: GrenadParameters, searchable_fields: &Option<HashSet<FieldId>>, stop_words: Option<&fst::Set<&[u8]>>, + allowed_separators: Option<&Vec<&str>>, + dictionary: 
Option<&Vec<&str>>, max_positions_per_attributes: Option<u32>, ) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> { puffin::profile_function!(); @@ -52,6 +54,14 @@ pub fn extract_docid_word_positions( if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); } + if let Some(dictionary) = dictionary { + // let dictionary: Vec<_> = dictionary.iter().map(String::as_str).collect(); + tokenizer_builder.words_dict(dictionary.as_slice()); + } + if let Some(separators) = allowed_separators { + // let separators: Vec<_> = separators.iter().map(String::as_str).collect(); + tokenizer_builder.separators(separators.as_slice()); + } let tokenizer = tokenizer_builder.build(); let mut cursor = obkv_documents.into_cursor()?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 1b1dc1420..cec0d5814 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -49,6 +49,8 @@ pub(crate) fn data_from_obkv_documents( geo_fields_ids: Option<(FieldId, FieldId)>, vectors_field_id: Option<FieldId>, stop_words: Option<fst::Set<&[u8]>>, + allowed_separators: Option<Vec<&str>>, + dictionary: Option<Vec<&str>>, max_positions_per_attributes: Option<u32>, exact_attributes: HashSet<FieldId>, ) -> Result<()> { @@ -76,6 +78,8 @@ pub(crate) fn data_from_obkv_documents( geo_fields_ids, vectors_field_id, &stop_words, + &allowed_separators, + &dictionary, max_positions_per_attributes, ) }) @@ -289,6 +293,8 @@ fn send_and_extract_flattened_documents_data( geo_fields_ids: Option<(FieldId, FieldId)>, vectors_field_id: Option<FieldId>, stop_words: &Option<fst::Set<&[u8]>>, + allowed_separators: &Option<Vec<&str>>, + dictionary: &Option<Vec<&str>>, max_positions_per_attributes: Option<u32>, ) -> Result<( grenad::Reader<File>, @@ -344,6 +350,8 @@ fn send_and_extract_flattened_documents_data( indexer, searchable_fields, stop_words.as_ref(), + allowed_separators.as_ref(), + dictionary.as_ref(), max_positions_per_attributes, )?; diff --git 
a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1b2aab827..9a657674e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -316,6 +316,12 @@ where let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors"); let stop_words = self.index.stop_words(self.wtxn)?; + let separators = self.index.allowed_separators(self.wtxn)?; + let separators: Option<Vec<&str>> = + separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let dictionary = self.index.dictionary(self.wtxn)?; + let dictionary: Option<Vec<&str>> = + dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; let pool_params = GrenadParameters { @@ -353,6 +359,8 @@ where geo_fields_ids, vectors_field_id, stop_words, + separators, + dictionary, max_positions_per_attributes, exact_attributes, )