Classify the NULL field values in the facet extractor

This commit is contained in:
Clément Renault 2023-03-08 16:46:42 +01:00
parent 9287858997
commit 19ab4d1a15
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 78 additions and 36 deletions

View File

@ -8,7 +8,7 @@ use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}
use heed::zerocopy::AsBytes; use heed::zerocopy::AsBytes;
use heed::BytesEncode; use heed::BytesEncode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::{from_slice, Value};
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError; use crate::error::InternalError;
@ -25,7 +25,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
faceted_fields: &HashSet<FieldId>, faceted_fields: &HashSet<FieldId>,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> { ) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>
{
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let mut fid_docid_facet_numbers_sorter = create_sorter( let mut fid_docid_facet_numbers_sorter = create_sorter(
@ -47,6 +48,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
); );
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
@ -70,33 +72,40 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
// For the other extraction tasks, prefix the key with the field_id and the document_id // For the other extraction tasks, prefix the key with the field_id and the document_id
key_buffer.extend_from_slice(docid_bytes); key_buffer.extend_from_slice(docid_bytes);
let value = let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; match extract_facet_values(&value) {
FilterableValues::Null => {
let (numbers, strings) = extract_facet_values(&value); facet_is_null_docids.entry(field_id).or_default().insert(document);
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
} }
} FilterableValues::Values { numbers, strings } => {
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// insert normalized and original facet string in sorter fid_docid_facet_numbers_sorter
for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { .insert(&key_buffer, ().as_bytes())?;
let normalised_truncated_value: String = normalized }
.char_indices() }
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); // insert normalized and original facet string in sorter
key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); for (normalized, original) in
fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; strings.into_iter().filter(|(n, _)| !n.is_empty())
{
let normalised_truncated_value: String = normalized
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalised_truncated_value.as_bytes());
fid_docid_facet_strings_sorter
.insert(&key_buffer, original.as_bytes())?;
}
}
} }
} }
} }
@ -113,14 +122,36 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
} }
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
let mut facet_is_null_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_null_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
}
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
Ok(( Ok((
sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
facet_is_null_docids_reader,
facet_exists_docids_reader, facet_exists_docids_reader,
)) ))
} }
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { /// Represent what a document field contains.
enum FilterableValues {
/// The field value is a JSON `null`.
Null,
/// Represents all the number and string values found in this document field.
Values {
/// Numeric facet values extracted from the field.
numbers: Vec<f64>,
/// String facet values, stored as `(normalized, original)` pairs.
strings: Vec<(String, String)>,
},
}
fn extract_facet_values(value: &Value) -> FilterableValues {
fn inner_extract_facet_values( fn inner_extract_facet_values(
value: &Value, value: &Value,
can_recurse: bool, can_recurse: bool,
@ -152,9 +183,13 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) {
} }
} }
let mut facet_number_values = Vec::new(); match value {
let mut facet_string_values = Vec::new(); Value::Null => FilterableValues::Null,
inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); otherwise => {
let mut numbers = Vec::new();
(facet_number_values, facet_string_values) let mut strings = Vec::new();
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
FilterableValues::Values { numbers, strings }
}
}
} }

View File

@ -55,7 +55,7 @@ pub(crate) fn data_from_obkv_documents(
.collect::<Result<()>>()?; .collect::<Result<()>>()?;
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>))))> = flattened_obkv_chunks
.par_bridge() .par_bridge()
.map(|flattened_obkv_chunks| { .map(|flattened_obkv_chunks| {
send_and_extract_flattened_documents_data( send_and_extract_flattened_documents_data(
@ -76,7 +76,10 @@ pub(crate) fn data_from_obkv_documents(
docid_word_positions_chunks, docid_word_positions_chunks,
( (
docid_fid_facet_numbers_chunks, docid_fid_facet_numbers_chunks,
(docid_fid_facet_strings_chunks, facet_exists_docids_chunks), (
docid_fid_facet_strings_chunks,
(facet_is_null_docids_chunks, facet_exists_docids_chunks),
),
), ),
) = result?; ) = result?;
@ -235,7 +238,7 @@ fn send_and_extract_flattened_documents_data(
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
( (
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
(grenad::Reader<CursorClonableMmap>, grenad::Reader<File>), (grenad::Reader<CursorClonableMmap>, (grenad::Reader<File>, grenad::Reader<File>)),
), ),
)> { )> {
let flattened_documents_chunk = let flattened_documents_chunk =
@ -284,6 +287,7 @@ fn send_and_extract_flattened_documents_data(
let ( let (
docid_fid_facet_numbers_chunk, docid_fid_facet_numbers_chunk,
docid_fid_facet_strings_chunk, docid_fid_facet_strings_chunk,
fid_facet_is_null_docids_chunk,
fid_facet_exists_docids_chunk, fid_facet_exists_docids_chunk,
) = extract_fid_docid_facet_values( ) = extract_fid_docid_facet_values(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
@ -309,7 +313,10 @@ fn send_and_extract_flattened_documents_data(
Ok(( Ok((
docid_fid_facet_numbers_chunk, docid_fid_facet_numbers_chunk,
(docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), (
docid_fid_facet_strings_chunk,
(fid_facet_is_null_docids_chunk, fid_facet_exists_docids_chunk),
),
)) ))
}, },
); );