Make the search and the indexing work with custom separators and dictionary

2025-02-20 17:45:54 +08:00 · 2023-07-24 18:35:20 +02:00 · 2023-07-24 18:35:20 +02:00 · 9c485f8563
commit 9c485f8563
parent d8d12d5979
7 changed files with 197 additions and 4 deletions
--- a/meilisearch/src/search.rs
+++ b/meilisearch/src/search.rs
@ -491,6 +491,20 @@ pub fn perform_search(
        tokenizer_builder.allow_list(&script_lang_map);
    }

+    let separators = index.allowed_separators(&rtxn)?;
+    let separators: Option<Vec<_>> =
+        separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+    if let Some(ref separators) = separators {
+        tokenizer_builder.separators(separators);
+    }
+
+    let dictionary = index.dictionary(&rtxn)?;
+    let dictionary: Option<Vec<_>> =
+        dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
+    if let Some(ref dictionary) = dictionary {
+        tokenizer_builder.words_dict(dictionary);
+    }
+
    let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
    formatter_builder.crop_marker(query.crop_marker);
    formatter_builder.highlight_prefix(query.highlight_pre_tag);
--- a/meilisearch/tests/settings/tokenizer_customization.rs
+++ b/meilisearch/tests/settings/tokenizer_customization.rs
@ -52,3 +52,145 @@ async fn set_and_reset() {
    snapshot!(json_string!(response["separatorTokens"]), @"[]");
    snapshot!(json_string!(response["dictionary"]), @"[]");
 }
+
+#[actix_rt::test]
+async fn set_and_search() {
+    let documents = json!([
+        {
+            "id": 1,
+            "content": "Mac & cheese",
+        },
+        {
+            "id": 2,
+            "content": "G#D#G#D#G#C#D#G#C#",
+        },
+        {
+            "id": 3,
+            "content": "Mac&sep&&sepcheese",
+        },
+    ]);
+
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    let (_response, _code) = index
+        .update_settings(json!({
+            "nonSeparatorTokens": ["#", "&"],
+            "separatorTokens": ["<br/>", "&sep"],
+            "dictionary": ["#", "A#", "B#", "C#", "D#", "E#", "F#", "G#"],
+        }))
+        .await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "&", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 1,
+                "content": "Mac & cheese",
+                "_formatted": {
+                  "id": "1",
+                  "content": "Mac <em>&</em> cheese"
+                }
+              },
+              {
+                "id": 3,
+                "content": "Mac&sep&&sepcheese",
+                "_formatted": {
+                  "id": "3",
+                  "content": "Mac&sep<em>&</em>&sepcheese"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(
+            json!({"q": "Mac & cheese", "attributesToHighlight": ["content"]}),
+            |response, code| {
+                snapshot!(code, @"200 OK");
+                snapshot!(json_string!(response["hits"]), @r###"
+                [
+                  {
+                    "id": 1,
+                    "content": "Mac & cheese",
+                    "_formatted": {
+                      "id": "1",
+                      "content": "<em>Mac</em> <em>&</em> <em>cheese</em>"
+                    }
+                  },
+                  {
+                    "id": 3,
+                    "content": "Mac&sep&&sepcheese",
+                    "_formatted": {
+                      "id": "3",
+                      "content": "<em>Mac</em>&sep<em>&</em>&sep<em>cheese</em>"
+                    }
+                  }
+                ]
+                "###);
+            },
+        )
+        .await;
+
+    index
+        .search(
+            json!({"q": "Mac&sep&&sepcheese", "attributesToHighlight": ["content"]}),
+            |response, code| {
+                snapshot!(code, @"200 OK");
+                snapshot!(json_string!(response["hits"]), @r###"
+                [
+                  {
+                    "id": 1,
+                    "content": "Mac & cheese",
+                    "_formatted": {
+                      "id": "1",
+                      "content": "<em>Mac</em> <em>&</em> <em>cheese</em>"
+                    }
+                  },
+                  {
+                    "id": 3,
+                    "content": "Mac&sep&&sepcheese",
+                    "_formatted": {
+                      "id": "3",
+                      "content": "<em>Mac</em>&sep<em>&</em>&sep<em>cheese</em>"
+                    }
+                  }
+                ]
+                "###);
+            },
+        )
+        .await;
+
+    index
+        .search(json!({"q": "C#D#G", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 2,
+                "content": "G#D#G#D#G#C#D#G#C#",
+                "_formatted": {
+                  "id": "2",
+                  "content": "<em>G</em>#<em>D#</em><em>G</em>#<em>D#</em><em>G</em>#<em>C#</em><em>D#</em><em>G</em>#<em>C#</em>"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "#", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @"[]");
+        })
+        .await;
+}
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -1094,10 +1094,7 @@ impl Index {

    /* separators easing method */

-    pub(crate) fn allowed_separators<'t>(
-        &self,
-        rtxn: &'t RoTxn,
-    ) -> Result<Option<BTreeSet<String>>> {
+    pub fn allowed_separators<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<BTreeSet<String>>> {
        let default_separators =
            charabia::separators::DEFAULT_SEPARATORS.iter().map(|s| s.to_string());
        let mut separators: Option<BTreeSet<_>> = None;
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -479,6 +479,20 @@ pub fn execute_search(
            tokbuilder.stop_words(stop_words);
        }

+        let separators = ctx.index.allowed_separators(ctx.txn)?;
+        let separators: Option<Vec<_>> =
+            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        if let Some(ref separators) = separators {
+            tokbuilder.separators(separators);
+        }
+
+        let dictionary = ctx.index.dictionary(ctx.txn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        if let Some(ref dictionary) = dictionary {
+            tokbuilder.words_dict(dictionary);
+        }
+
        let script_lang_map = ctx.index.script_language(ctx.txn)?;
        if !script_lang_map.is_empty() {
            tokbuilder.allow_list(&script_lang_map);
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -28,6 +28,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    indexer: GrenadParameters,
    searchable_fields: &Option<HashSet<FieldId>>,
    stop_words: Option<&fst::Set<&[u8]>>,
+    allowed_separators: Option<&Vec<&str>>,
+    dictionary: Option<&Vec<&str>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
    puffin::profile_function!();
@ -52,6 +54,14 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    if let Some(stop_words) = stop_words {
        tokenizer_builder.stop_words(stop_words);
    }
+    if let Some(dictionary) = dictionary {
+        // `dictionary` already holds borrowed `&str` entries, so it is passed as a slice as-is.
+        tokenizer_builder.words_dict(dictionary.as_slice());
+    }
+    if let Some(separators) = allowed_separators {
+        // `separators` already holds borrowed `&str` entries, so it is passed as a slice as-is.
+        tokenizer_builder.separators(separators.as_slice());
+    }
    let tokenizer = tokenizer_builder.build();

    let mut cursor = obkv_documents.into_cursor()?;
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@ -49,6 +49,8 @@ pub(crate) fn data_from_obkv_documents(
    geo_fields_ids: Option<(FieldId, FieldId)>,
    vectors_field_id: Option<FieldId>,
    stop_words: Option<fst::Set<&[u8]>>,
+    allowed_separators: Option<Vec<&str>>,
+    dictionary: Option<Vec<&str>>,
    max_positions_per_attributes: Option<u32>,
    exact_attributes: HashSet<FieldId>,
 ) -> Result<()> {
@ -76,6 +78,8 @@ pub(crate) fn data_from_obkv_documents(
                    geo_fields_ids,
                    vectors_field_id,
                    &stop_words,
+                    &allowed_separators,
+                    &dictionary,
                    max_positions_per_attributes,
                )
            })
@ -289,6 +293,8 @@ fn send_and_extract_flattened_documents_data(
    geo_fields_ids: Option<(FieldId, FieldId)>,
    vectors_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
+    allowed_separators: &Option<Vec<&str>>,
+    dictionary: &Option<Vec<&str>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
    grenad::Reader<CursorClonableMmap>,
@ -344,6 +350,8 @@ fn send_and_extract_flattened_documents_data(
                        indexer,
                        searchable_fields,
                        stop_words.as_ref(),
+                        allowed_separators.as_ref(),
+                        dictionary.as_ref(),
                        max_positions_per_attributes,
                    )?;

--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -316,6 +316,12 @@ where
        let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors");

        let stop_words = self.index.stop_words(self.wtxn)?;
+        let separators = self.index.allowed_separators(self.wtxn)?;
+        let separators: Option<Vec<_>> =
+            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        let dictionary = self.index.dictionary(self.wtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;

        let pool_params = GrenadParameters {
@ -353,6 +359,8 @@ where
                    geo_fields_ids,
                    vectors_field_id,
                    stop_words,
+                    separators,
+                    dictionary,
                    max_positions_per_attributes,
                    exact_attributes,
                )