From ff4b3578bf53e654564625994a1f9f77a8a5a4ea Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 20 Nov 2024 09:44:23 +0100 Subject: [PATCH] Fix index_documents_check_exists_database --- .../new/extract/faceted/facet_document.rs | 55 ++++++++---- crates/milli/src/update/new/extract/mod.rs | 86 +++++++++++++------ .../extract/searchable/tokenize_document.rs | 6 +- 3 files changed, 103 insertions(+), 44 deletions(-) diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index 2820f04b9..eff529120 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -24,25 +24,46 @@ pub fn extract_document_facets<'doc>( }; // if the current field is searchable or contains a searchable attribute - if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) { + let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]); + if selection != perm_json_p::Selection::Skip { // parse json. match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? 
{ - Value::Object(object) => perm_json_p::seek_leaf_values_in_object( - &object, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, - )?, - Value::Array(array) => perm_json_p::seek_leaf_values_in_array( - &array, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, - )?, + Value::Object(object) => { + perm_json_p::seek_leaf_values_in_object( + &object, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + perm_json_p::Depth::OnBaseKey, + &mut tokenize_field, + )?; + + if selection == perm_json_p::Selection::Select { + tokenize_field( + field_name, + perm_json_p::Depth::OnBaseKey, + &Value::Object(object), + )?; + } + } + Value::Array(array) => { + perm_json_p::seek_leaf_values_in_array( + &array, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + perm_json_p::Depth::OnBaseKey, + &mut tokenize_field, + )?; + + if selection == perm_json_p::Selection::Select { + tokenize_field( + field_name, + perm_json_p::Depth::OnBaseKey, + &Value::Array(array), + )?; + } + } value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, } } diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index 17ee4209c..0e4f19daf 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -88,25 +88,37 @@ pub mod perm_json_p { // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` // so we check the contained_in on both side - let should_continue = select_field(&base_key, selectors, skip_selectors); - if should_continue { + let selection = select_field(&base_key, selectors, skip_selectors); + if selection != Selection::Skip { match value { - Value::Object(object) => seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - &base_key, - 
Depth::OnBaseKey, - seeker, - ), - Value::Array(array) => seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ), + Value::Object(object) => { + if selection == Selection::Select { + seeker(&base_key, Depth::OnBaseKey, value)?; + } + + seek_leaf_values_in_object( + object, + selectors, + skip_selectors, + &base_key, + Depth::OnBaseKey, + seeker, + ) + } + Value::Array(array) => { + if selection == Selection::Select { + seeker(&base_key, Depth::OnBaseKey, value)?; + } + + seek_leaf_values_in_array( + array, + selectors, + skip_selectors, + &base_key, + Depth::OnBaseKey, + seeker, + ) + } value => seeker(&base_key, Depth::OnBaseKey, value), }?; } @@ -156,13 +168,37 @@ pub mod perm_json_p { field_name: &str, selectors: Option<&[&str]>, skip_selectors: &[&str], - ) -> bool { - selectors.map_or(true, |selectors| { - selectors.iter().any(|selector| { - contained_in(selector, field_name) || contained_in(field_name, selector) - }) - }) && !skip_selectors.iter().any(|skip_selector| { + ) -> Selection { + if skip_selectors.iter().any(|skip_selector| { contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector) - }) + }) { + Selection::Skip + } else if let Some(selectors) = selectors { + selectors + .iter() + .filter_map(|selector| { + if contained_in(field_name, selector) { + Some(Selection::Select) + } else if contained_in(selector, field_name) { + Some(Selection::Parent) + } else { + None + } + }) + .next() + .unwrap_or(Selection::Skip) + } else { + Selection::Select + } + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + pub enum Selection { + /// The field is a parent of a nested field that must be selected + Parent, + /// The field must be selected + Select, + /// The field must be skipped + Skip, } } diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index 4bfcfbf16..0e9ed1826 
100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -5,7 +5,7 @@ use serde_json::Value; use crate::update::new::document::Document; use crate::update::new::extract::perm_json_p::{ - seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, + seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection, }; use crate::{ FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError, @@ -88,7 +88,9 @@ impl<'a> DocumentTokenizer<'a> { }; // if the current field is searchable or contains a searchable attribute - if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) { + if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) + != Selection::Skip + { // parse json. match serde_json::to_value(value).map_err(InternalError::SerdeJson)? { Value::Object(object) => seek_leaf_values_in_object(