Parse every attribute and filter before tokenization

This commit is contained in:
ManyTheFish 2024-11-20 15:08:27 +01:00
parent ff9c92c409
commit 4d616f8794
3 changed files with 33 additions and 50 deletions

View File

@ -1623,15 +1623,14 @@ async fn change_attributes_settings() {
index index
.search( .search(
json!({ json!({
"q": "bobby" "q": "bobby",
"attributesToRetrieve": ["id", "doggos"]
}), }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[ [
{ {
"father": "jean",
"mother": "michelle",
"id": 852, "id": 852,
"doggos": [ "doggos": [
{ {
@ -1642,15 +1641,7 @@ async fn change_attributes_settings() {
"name": "buddy", "name": "buddy",
"age": 4 "age": 4
} }
], ]
"cattos": "pésti",
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
}
} }
] ]
"###); "###);
@ -1663,15 +1654,14 @@ async fn change_attributes_settings() {
.search( .search(
json!({ json!({
"q": "", "q": "",
"filter": "doggos.age < 5" "filter": "doggos.age < 5",
"attributesToRetrieve": ["id", "doggos"]
}), }),
|response, code| { |response, code| {
assert_eq!(code, 200, "{}", response); assert_eq!(code, 200, "{}", response);
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
[ [
{ {
"father": "jean",
"mother": "michelle",
"id": 852, "id": 852,
"doggos": [ "doggos": [
{ {
@ -1682,15 +1672,7 @@ async fn change_attributes_settings() {
"name": "buddy", "name": "buddy",
"age": 4 "age": 4
} }
], ]
"cattos": "pésti",
"_vectors": {
"manual": [
1.0,
2.0,
3.0
]
}
} }
] ]
"###); "###);

View File

@ -75,12 +75,12 @@ pub trait SearchableExtractor: Sized + Sync {
let dictionary = indexing_context.index.dictionary(&rtxn)?; let dictionary = indexing_context.index.dictionary(&rtxn)?;
let dictionary: Option<Vec<_>> = let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let builder = tokenizer_builder( let mut builder = tokenizer_builder(
stop_words.as_ref(), stop_words.as_ref(),
allowed_separators.as_deref(), allowed_separators.as_deref(),
dictionary.as_deref(), dictionary.as_deref(),
); );
let tokenizer = builder.into_tokenizer(); let tokenizer = builder.build();
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?; let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?; let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;

View File

@ -40,6 +40,12 @@ impl<'a> DocumentTokenizer<'a> {
return Err(UserError::AttributeLimitReached.into()); return Err(UserError::AttributeLimitReached.into());
}; };
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
!= Selection::Select
{
return Ok(());
}
let position = field_position let position = field_position
.entry(field_id) .entry(field_id)
.and_modify(|counter| *counter += MAX_DISTANCE) .and_modify(|counter| *counter += MAX_DISTANCE)
@ -87,30 +93,25 @@ impl<'a> DocumentTokenizer<'a> {
Ok(()) Ok(())
}; };
// if the current field is searchable or contains a searchable attribute // parse json.
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
!= Selection::Skip Value::Object(object) => seek_leaf_values_in_object(
{ &object,
// parse json. None,
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? { &[],
Value::Object(object) => seek_leaf_values_in_object( field_name,
&object, Depth::OnBaseKey,
self.attribute_to_extract, &mut tokenize_field,
self.attribute_to_skip, )?,
field_name, Value::Array(array) => seek_leaf_values_in_array(
Depth::OnBaseKey, &array,
&mut tokenize_field, None,
)?, &[],
Value::Array(array) => seek_leaf_values_in_array( field_name,
&array, Depth::OnBaseKey,
self.attribute_to_extract, &mut tokenize_field,
self.attribute_to_skip, )?,
field_name, value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
Depth::OnBaseKey,
&mut tokenize_field,
)?,
value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
}
} }
} }