diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 71ac330e2..5fe6a7606 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -8,7 +8,7 @@ use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer} use heed::zerocopy::AsBytes; use heed::BytesEncode; use roaring::RoaringBitmap; -use serde_json::Value; +use serde_json::{from_slice, Value}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; @@ -25,7 +25,8 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader, grenad::Reader)> +{ let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -47,6 +48,7 @@ pub fn extract_fid_docid_facet_values( ); let mut facet_exists_docids = BTreeMap::::new(); + let mut facet_is_null_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -70,33 +72,40 @@ pub fn extract_fid_docid_facet_values( // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(docid_bytes); - let value = - serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - - let (numbers, strings) = extract_facet_values(&value); - - // insert facet numbers in sorter - for number in numbers { - key_buffer.truncate(size_of::() + size_of::()); - if let Some(value_bytes) = f64_into_bytes(number) { - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); - - fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + match extract_facet_values(&value) { + FilterableValues::Null => { + facet_is_null_docids.entry(field_id).or_default().insert(document); } - } + FilterableValues::Values { numbers, strings } => { + // insert facet numbers in sorter + for number in numbers { + key_buffer.truncate(size_of::() + size_of::()); + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); - // insert normalized and original facet string in sorter - for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { - let normalised_truncated_value: String = normalized - .char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); + fid_docid_facet_numbers_sorter + .insert(&key_buffer, ().as_bytes())?; + } + } - key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); - fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; + // insert normalized and original facet string in sorter + for (normalized, original) in + strings.into_iter().filter(|(n, _)| !n.is_empty()) + { + let normalised_truncated_value: String = normalized + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + + key_buffer.truncate(size_of::() + size_of::()); + key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); + fid_docid_facet_strings_sorter + .insert(&key_buffer, original.as_bytes())?; + } + } } } } @@ -113,14 +122,36 @@ pub fn extract_fid_docid_facet_values( } let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; + let mut facet_is_null_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_is_null_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; + Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + facet_is_null_docids_reader, facet_exists_docids_reader, )) } -fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { +/// Represent what a document field contains. +enum FilterableValues { + Null, + /// Represents all the numbers and strings values found in this document field. + Values { + numbers: Vec, + strings: Vec<(String, String)>, + }, +} + +fn extract_facet_values(value: &Value) -> FilterableValues { fn inner_extract_facet_values( value: &Value, can_recurse: bool, @@ -152,9 +183,13 @@ fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { } } - let mut facet_number_values = Vec::new(); - let mut facet_string_values = Vec::new(); - inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); - - (facet_number_values, facet_string_values) + match value { + Value::Null => FilterableValues::Null, + otherwise => { + let mut numbers = Vec::new(); + let mut strings = Vec::new(); + inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings); + FilterableValues::Values { numbers, strings } + } + } } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c0f07cf79..9f9fc8f4f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -55,7 +55,7 @@ pub(crate) fn data_from_obkv_documents( .collect::>()?; #[allow(clippy::type_complexity)] - let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>))))> = flattened_obkv_chunks .par_bridge() .map(|flattened_obkv_chunks| { send_and_extract_flattened_documents_data( @@ -76,7 +76,10 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks, ( docid_fid_facet_numbers_chunks, - (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), + ( + docid_fid_facet_strings_chunks, + (facet_is_null_docids_chunks, facet_exists_docids_chunks), + ), ), ) = result?; @@ -235,7 +238,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, (grenad::Reader, grenad::Reader)), ), )> { let flattened_documents_chunk = @@ -284,6 +287,7 @@ fn send_and_extract_flattened_documents_data( let ( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, + fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), @@ -309,7 +313,10 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), + ( + docid_fid_facet_strings_chunk, + (fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk), + ), )) }, );