From 057fcb39932089325fd2101fb5040cb14e202374 Mon Sep 17 00:00:00 2001 From: Lukas Kalbertodt Date: Wed, 20 Nov 2024 01:00:43 +0100 Subject: [PATCH] Add `indices` field to `_matchesPosition` to specify where in an array a match comes from (#5005) * Remove unreachable code * Add `indices` field to `MatchBounds` For matches inside arrays, this field holds the indices of the array elements that matched. For example, searching for `cat` inside `{ "a": ["dog", "cat", "fox"] }` would return `indices: [1]`. For nested arrays, this contains multiple indices, starting with the one for the top-most array. For matches in fields without arrays, `indices` is not serialized (does not exist) to save space. --- crates/meilisearch/src/search/mod.rs | 124 +++++++------------ crates/meilisearch/tests/search/formatted.rs | 5 +- crates/milli/src/search/new/matches/mod.rs | 11 +- crates/permissive-json-pointer/src/lib.rs | 96 +++++++++++--- 4 files changed, 139 insertions(+), 97 deletions(-) diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 7832c1761..241c3ab81 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -1733,46 +1733,51 @@ fn format_fields( // select the attributes to retrieve let displayable_names = displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name")); - permissive_json_pointer::map_leaf_values(&mut document, displayable_names, |key, value| { - // To get the formatting option of each key we need to see all the rules that applies - // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` - // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only - // highlighted. - // Warn: The time to compute the format list scales with the number of fields to format; - // cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity: - // d*f where d is the total number of fields to display and f is the total number of fields to format. - let format = formatting_fields_options - .iter() - .filter(|(name, _option)| { - milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) - }) - .map(|(_, option)| **option) - .reduce(|acc, option| acc.merge(option)); - let mut infos = Vec::new(); - - // if no locales has been provided, we try to find the locales in the localized_attributes. - let locales = locales.or_else(|| { - localized_attributes + permissive_json_pointer::map_leaf_values( + &mut document, + displayable_names, + |key, array_indices, value| { + // To get the formatting option of each key we need to see all the rules that applies + // to the value and merge them together. eg. If a user said he wanted to highlight `doggo` + // and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only + // highlighted. + // Warn: The time to compute the format list scales with the number of fields to format; + // cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity: + // d*f where d is the total number of fields to display and f is the total number of fields to format. + let format = formatting_fields_options .iter() - .find(|rule| rule.match_str(key)) - .map(LocalizedAttributesRule::locales) - }); + .filter(|(name, _option)| { + milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name) + }) + .map(|(_, option)| **option) + .reduce(|acc, option| acc.merge(option)); + let mut infos = Vec::new(); - *value = format_value( - std::mem::take(value), - builder, - format, - &mut infos, - compute_matches, - locales, - ); + // if no locales has been provided, we try to find the locales in the localized_attributes. + let locales = locales.or_else(|| { + localized_attributes + .iter() + .find(|rule| rule.match_str(key)) + .map(LocalizedAttributesRule::locales) + }); - if let Some(matches) = matches_position.as_mut() { - if !infos.is_empty() { - matches.insert(key.to_owned(), infos); + *value = format_value( + std::mem::take(value), + builder, + format, + &mut infos, + compute_matches, + array_indices, + locales, + ); + + if let Some(matches) = matches_position.as_mut() { + if !infos.is_empty() { + matches.insert(key.to_owned(), infos); + } } - } - }); + }, + ); let selectors = formatted_options .keys() @@ -1790,13 +1795,14 @@ fn format_value( format_options: Option, infos: &mut Vec, compute_matches: bool, + array_indices: &[usize], locales: Option<&[Language]>, ) -> Value { match value { Value::String(old_string) => { let mut matcher = builder.build(&old_string, locales); if compute_matches { - let matches = matcher.matches(); + let matches = matcher.matches(array_indices); infos.extend_from_slice(&matches[..]); } @@ -1808,51 +1814,15 @@ fn format_value( None => Value::String(old_string), } } - Value::Array(values) => Value::Array( - values - .into_iter() - .map(|v| { - format_value( - v, - builder, - format_options.map(|format_options| FormatOptions { - highlight: format_options.highlight, - crop: None, - }), - infos, - compute_matches, - locales, - ) - }) - .collect(), - ), - Value::Object(object) => Value::Object( - object - .into_iter() - .map(|(k, v)| { - ( - k, - format_value( - v, - builder, - format_options.map(|format_options| FormatOptions { - highlight: format_options.highlight, - crop: None, - }), - infos, - compute_matches, - locales, - ), - ) - }) - .collect(), - ), + // `map_leaf_values` makes sure this is only called for leaf fields + Value::Array(_) => unreachable!(), + Value::Object(_) => unreachable!(), Value::Number(number) => { let s = number.to_string(); let mut matcher = builder.build(&s, locales); if compute_matches { - let matches = matcher.matches(); + let matches = matcher.matches(array_indices); infos.extend_from_slice(&matches[..]); } diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs index 1484b6393..ee33939fd 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -208,7 +208,10 @@ async fn format_nested() { "doggos.name": [ { "start": 0, - "length": 5 + "length": 5, + "indices": [ + 0 + ] } ] } diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 80e3ec7b2..7d8d25502 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -105,6 +105,8 @@ impl FormatOptions { pub struct MatchBounds { pub start: usize, pub length: usize, + #[serde(skip_serializing_if = "Option::is_none")] + pub indices: Option>, } /// Structure used to analyze a string, compute words that match, @@ -220,15 +222,20 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } /// Returns boundaries of the words that match the query. - pub fn matches(&mut self) -> Vec { + pub fn matches(&mut self, array_indices: &[usize]) -> Vec { match &self.matches { - None => self.compute_matches().matches(), + None => self.compute_matches().matches(array_indices), Some((tokens, matches)) => matches .iter() .map(|m| MatchBounds { start: tokens[m.get_first_token_pos()].byte_start, // TODO: Why is this in chars, while start is in bytes? length: m.char_count, + indices: if array_indices.is_empty() { + None + } else { + Some(array_indices.to_owned()) + }, }) .collect(), } diff --git a/crates/permissive-json-pointer/src/lib.rs b/crates/permissive-json-pointer/src/lib.rs index c771f3321..a4e16fedf 100644 --- a/crates/permissive-json-pointer/src/lib.rs +++ b/crates/permissive-json-pointer/src/lib.rs @@ -45,7 +45,7 @@ fn contained_in(selector: &str, key: &str) -> bool { /// map_leaf_values( /// value.as_object_mut().unwrap(), /// ["jean.race.name"], -/// |key, value| match (value, key) { +/// |key, _array_indices, value| match (value, key) { /// (Value::String(name), "jean.race.name") => *name = "patou".to_string(), /// _ => unreachable!(), /// }, @@ -66,17 +66,18 @@ fn contained_in(selector: &str, key: &str) -> bool { pub fn map_leaf_values<'a>( value: &mut Map, selectors: impl IntoIterator, - mut mapper: impl FnMut(&str, &mut Value), + mut mapper: impl FnMut(&str, &[usize], &mut Value), ) { let selectors: Vec<_> = selectors.into_iter().collect(); - map_leaf_values_in_object(value, &selectors, "", &mut mapper); + map_leaf_values_in_object(value, &selectors, "", &[], &mut mapper); } pub fn map_leaf_values_in_object( value: &mut Map, selectors: &[&str], base_key: &str, - mapper: &mut impl FnMut(&str, &mut Value), + array_indices: &[usize], + mapper: &mut impl FnMut(&str, &[usize], &mut Value), ) { for (key, value) in value.iter_mut() { let base_key = if base_key.is_empty() { @@ -94,12 +95,12 @@ pub fn map_leaf_values_in_object( if should_continue { match value { Value::Object(object) => { - map_leaf_values_in_object(object, selectors, &base_key, mapper) + map_leaf_values_in_object(object, selectors, &base_key, array_indices, mapper) } Value::Array(array) => { - map_leaf_values_in_array(array, selectors, &base_key, mapper) + map_leaf_values_in_array(array, selectors, &base_key, array_indices, mapper) } - value => mapper(&base_key, value), + value => mapper(&base_key, array_indices, value), } } } @@ -109,13 +110,24 @@ pub fn map_leaf_values_in_array( values: &mut [Value], selectors: &[&str], base_key: &str, - mapper: &mut impl FnMut(&str, &mut Value), + base_array_indices: &[usize], + mapper: &mut impl FnMut(&str, &[usize], &mut Value), ) { - for value in values.iter_mut() { + // This avoids allocating twice + let mut array_indices = Vec::with_capacity(base_array_indices.len() + 1); + array_indices.extend_from_slice(base_array_indices); + array_indices.push(0); + + for (i, value) in values.iter_mut().enumerate() { + *array_indices.last_mut().unwrap() = i; match value { - Value::Object(object) => map_leaf_values_in_object(object, selectors, base_key, mapper), - Value::Array(array) => map_leaf_values_in_array(array, selectors, base_key, mapper), - value => mapper(base_key, value), + Value::Object(object) => { + map_leaf_values_in_object(object, selectors, base_key, &array_indices, mapper) + } + Value::Array(array) => { + map_leaf_values_in_array(array, selectors, base_key, &array_indices, mapper) + } + value => mapper(base_key, &array_indices, value), } } } @@ -743,12 +755,14 @@ mod tests { } }); - map_leaf_values(value.as_object_mut().unwrap(), ["jean.race.name"], |key, value| { - match (value, key) { + map_leaf_values( + value.as_object_mut().unwrap(), + ["jean.race.name"], + |key, _, value| match (value, key) { (Value::String(name), "jean.race.name") => *name = S("patou"), _ => unreachable!(), - } - }); + }, + ); assert_eq!( value, @@ -775,7 +789,7 @@ mod tests { }); let mut calls = 0; - map_leaf_values(value.as_object_mut().unwrap(), ["jean"], |key, value| { + map_leaf_values(value.as_object_mut().unwrap(), ["jean"], |key, _, value| { calls += 1; match (value, key) { (Value::String(name), "jean.race.name") => *name = S("patou"), @@ -798,4 +812,52 @@ mod tests { }) ); } + + #[test] + fn map_array() { + let mut value: Value = json!({ + "no_array": "peter", + "simple": ["foo", "bar"], + "nested": [ + { + "a": [ + ["cat", "dog"], + ["fox", "bear"], + ], + "b": "hi", + }, + { + "a": ["green", "blue"], + }, + ], + }); + + map_leaf_values( + value.as_object_mut().unwrap(), + ["no_array", "simple", "nested"], + |_key, array_indices, value| { + *value = format!("{array_indices:?}").into(); + }, + ); + + assert_eq!( + value, + json!({ + "no_array": "[]", + "simple": ["[0]", "[1]"], + "nested": [ + { + "a": [ + ["[0, 0, 0]", "[0, 0, 1]"], + ["[0, 1, 0]", "[0, 1, 1]"], + ], + "b": "[0]", + }, + { + "a": ["[1, 0]", "[1, 1]"], + }, + ], + }) + ); + } }