From 4d616f8794cfc1d60ed309b44d83f000361bce55 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 20 Nov 2024 15:08:27 +0100 Subject: [PATCH] Parse every attributes and filter before tokenization --- crates/meilisearch/tests/search/mod.rs | 30 +++--------- .../src/update/new/extract/searchable/mod.rs | 4 +- .../extract/searchable/tokenize_document.rs | 49 ++++++++++--------- 3 files changed, 33 insertions(+), 50 deletions(-) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index e39d0fc5a..8cafe1dd8 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -1623,15 +1623,14 @@ async fn change_attributes_settings() { index .search( json!({ - "q": "bobby" + "q": "bobby", + "attributesToRetrieve": ["id", "doggos"] }), |response, code| { assert_eq!(code, 200, "{}", response); meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" [ { - "father": "jean", - "mother": "michelle", "id": 852, "doggos": [ { @@ -1642,15 +1641,7 @@ async fn change_attributes_settings() { "name": "buddy", "age": 4 } - ], - "cattos": "pésti", - "_vectors": { - "manual": [ - 1.0, - 2.0, - 3.0 - ] - } + ] } ] "###); @@ -1663,15 +1654,14 @@ async fn change_attributes_settings() { .search( json!({ "q": "", - "filter": "doggos.age < 5" + "filter": "doggos.age < 5", + "attributesToRetrieve": ["id", "doggos"] }), |response, code| { assert_eq!(code, 200, "{}", response); meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" [ { - "father": "jean", - "mother": "michelle", "id": 852, "doggos": [ { @@ -1682,15 +1672,7 @@ async fn change_attributes_settings() { "name": "buddy", "age": 4 } - ], - "cattos": "pésti", - "_vectors": { - "manual": [ - 1.0, - 2.0, - 3.0 - ] - } + ] } ] "###); diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index b61dfcf92..05d2406d9 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -75,12 +75,12 @@ pub trait SearchableExtractor: Sized + Sync { let dictionary = indexing_context.index.dictionary(&rtxn)?; let dictionary: Option> = dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let builder = tokenizer_builder( + let mut builder = tokenizer_builder( stop_words.as_ref(), allowed_separators.as_deref(), dictionary.as_deref(), ); - let tokenizer = builder.into_tokenizer(); + let tokenizer = builder.build(); let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?; let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?; diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index cc86adfeb..ffdce5b7e 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -40,6 +40,12 @@ impl<'a> DocumentTokenizer<'a> { return Err(UserError::AttributeLimitReached.into()); }; + if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) + != Selection::Select + { + return Ok(()); + } + let position = field_position .entry(field_id) .and_modify(|counter| *counter += MAX_DISTANCE) @@ -87,30 +93,25 @@ impl<'a> DocumentTokenizer<'a> { Ok(()) }; - // if the current field is searchable or contains a searchable attribute - if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) - != Selection::Skip - { - // parse json. - match serde_json::to_value(value).map_err(InternalError::SerdeJson)? { - Value::Object(object) => seek_leaf_values_in_object( - &object, - self.attribute_to_extract, - self.attribute_to_skip, - field_name, - Depth::OnBaseKey, - &mut tokenize_field, - )?, - Value::Array(array) => seek_leaf_values_in_array( - &array, - self.attribute_to_extract, - self.attribute_to_skip, - field_name, - Depth::OnBaseKey, - &mut tokenize_field, - )?, - value => tokenize_field(field_name, Depth::OnBaseKey, &value)?, - } + // parse json. + match serde_json::to_value(value).map_err(InternalError::SerdeJson)? { + Value::Object(object) => seek_leaf_values_in_object( + &object, + None, + &[], + field_name, + Depth::OnBaseKey, + &mut tokenize_field, + )?, + Value::Array(array) => seek_leaf_values_in_array( + &array, + None, + &[], + field_name, + Depth::OnBaseKey, + &mut tokenize_field, + )?, + value => tokenize_field(field_name, Depth::OnBaseKey, &value)?, } }