Skip iterating over documents when the faceted field list doesn't change

This commit is contained in:
ManyTheFish 2024-06-04 15:31:24 +02:00
parent 17c5ceeb9d
commit d29d4f88da

View File

@ -66,11 +66,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
max_memory.map(|m| m / 2), max_memory.map(|m| m / 2),
); );
let old_faceted_fids: BTreeSet<_> =
settings_diff.old.faceted_fields_ids.iter().copied().collect();
let new_faceted_fids: BTreeSet<_> =
settings_diff.new.faceted_fields_ids.iter().copied().collect();
// The tuples represents the Del and Add side for a bitmap // The tuples represents the Del and Add side for a bitmap
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
@ -80,172 +75,181 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let mut numbers_key_buffer = Vec::new(); let mut numbers_key_buffer = Vec::new();
let mut strings_key_buffer = Vec::new(); let mut strings_key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?; let old_faceted_fids: BTreeSet<_> =
while let Some((docid_bytes, value)) = cursor.move_on_next()? { settings_diff.old.faceted_fields_ids.iter().copied().collect();
let obkv = obkv::KvReader::new(value); let new_faceted_fids: BTreeSet<_> =
let get_document_json_value = move |field_id, side| { settings_diff.new.faceted_fields_ids.iter().copied().collect();
obkv.get(field_id)
.map(KvReaderDelAdd::new)
.and_then(|kv| kv.get(side))
.map(from_slice)
.transpose()
.map_err(InternalError::SerdeJson)
};
// iterate over the faceted fields instead of over the whole document.
for eob in
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| old.cmp(new))
{
let (field_id, del_value, add_value) = match eob {
EitherOrBoth::Left(&field_id) => {
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
// deletion only if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
(field_id, del_value, None) let mut cursor = obkv_documents.into_cursor()?;
} while let Some((docid_bytes, value)) = cursor.move_on_next()? {
EitherOrBoth::Right(&field_id) => { let obkv = obkv::KvReader::new(value);
let add_value = get_document_json_value(field_id, DelAdd::Addition)?; let get_document_json_value = move |field_id, side| {
obkv.get(field_id)
.map(KvReaderDelAdd::new)
.and_then(|kv| kv.get(side))
.map(from_slice)
.transpose()
.map_err(InternalError::SerdeJson)
};
// iterate over the faceted fields instead of over the whole document.
for eob in
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
old.cmp(new)
})
{
let (field_id, del_value, add_value) = match eob {
EitherOrBoth::Left(&field_id) => {
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
// addition only // deletion only
(field_id, None, add_value) (field_id, del_value, None)
} }
EitherOrBoth::Both(&field_id, _) => { EitherOrBoth::Right(&field_id) => {
// during settings update, recompute the changing settings only. let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
if settings_diff.settings_update_only {
continue; // addition only
(field_id, None, add_value)
}
EitherOrBoth::Both(&field_id, _) => {
// during settings update, recompute the changing settings only.
if settings_diff.settings_update_only {
continue;
}
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
(field_id, del_value, add_value)
}
};
if del_value.is_some() || add_value.is_some() {
numbers_key_buffer.clear();
strings_key_buffer.clear();
// Set key to the field_id
// Note: this encoding is consistent with FieldIdCodec
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
let document = DocumentId::from_be_bytes(document);
// For the other extraction tasks, prefix the key with the field_id and the document_id
numbers_key_buffer.extend_from_slice(docid_bytes);
strings_key_buffer.extend_from_slice(docid_bytes);
// We insert the document id on the Del and the Add side if the field exists.
let (ref mut del_exists, ref mut add_exists) =
facet_exists_docids.entry(field_id).or_default();
let (ref mut del_is_null, ref mut add_is_null) =
facet_is_null_docids.entry(field_id).or_default();
let (ref mut del_is_empty, ref mut add_is_empty) =
facet_is_empty_docids.entry(field_id).or_default();
if del_value.is_some() {
del_exists.insert(document);
}
if add_value.is_some() {
add_exists.insert(document);
} }
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; let del_geo_support = settings_diff
let add_value = get_document_json_value(field_id, DelAdd::Addition)?; .old
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let add_geo_support = settings_diff
.new
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, del_geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, add_geo_support));
(field_id, del_value, add_value) // Those closures are just here to simplify things a bit.
} let mut insert_numbers_diff = |del_numbers, add_numbers| {
}; insert_numbers_diff(
&mut fid_docid_facet_numbers_sorter,
&mut numbers_key_buffer,
del_numbers,
add_numbers,
)
};
let mut insert_strings_diff = |del_strings, add_strings| {
insert_strings_diff(
&mut fid_docid_facet_strings_sorter,
&mut strings_key_buffer,
del_strings,
add_strings,
)
};
if del_value.is_some() || add_value.is_some() { match (del_filterable_values, add_filterable_values) {
numbers_key_buffer.clear(); (None, None) => (),
strings_key_buffer.clear(); (Some(del_filterable_values), None) => match del_filterable_values {
Null => {
// Set key to the field_id
// Note: this encoding is consistent with FieldIdCodec
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
let document = DocumentId::from_be_bytes(document);
// For the other extraction tasks, prefix the key with the field_id and the document_id
numbers_key_buffer.extend_from_slice(docid_bytes);
strings_key_buffer.extend_from_slice(docid_bytes);
// We insert the document id on the Del and the Add side if the field exists.
let (ref mut del_exists, ref mut add_exists) =
facet_exists_docids.entry(field_id).or_default();
let (ref mut del_is_null, ref mut add_is_null) =
facet_is_null_docids.entry(field_id).or_default();
let (ref mut del_is_empty, ref mut add_is_empty) =
facet_is_empty_docids.entry(field_id).or_default();
if del_value.is_some() {
del_exists.insert(document);
}
if add_value.is_some() {
add_exists.insert(document);
}
let del_geo_support = settings_diff
.old
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let add_geo_support = settings_diff
.new
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, del_geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, add_geo_support));
// Those closures are just here to simplify things a bit.
let mut insert_numbers_diff = |del_numbers, add_numbers| {
insert_numbers_diff(
&mut fid_docid_facet_numbers_sorter,
&mut numbers_key_buffer,
del_numbers,
add_numbers,
)
};
let mut insert_strings_diff = |del_strings, add_strings| {
insert_strings_diff(
&mut fid_docid_facet_strings_sorter,
&mut strings_key_buffer,
del_strings,
add_strings,
)
};
match (del_filterable_values, add_filterable_values) {
(None, None) => (),
(Some(del_filterable_values), None) => match del_filterable_values {
Null => {
del_is_null.insert(document);
}
Empty => {
del_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
},
(None, Some(add_filterable_values)) => match add_filterable_values {
Null => {
add_is_null.insert(document);
}
Empty => {
add_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
}
},
(Some(del_filterable_values), Some(add_filterable_values)) => {
match (del_filterable_values, add_filterable_values) {
(Null, Null) | (Empty, Empty) => (),
(Null, Empty) => {
del_is_null.insert(document);
add_is_empty.insert(document);
}
(Empty, Null) => {
del_is_empty.insert(document);
add_is_null.insert(document);
}
(Null, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_null.insert(document); del_is_null.insert(document);
} }
(Empty, Values { numbers, strings }) => { Empty => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_empty.insert(document); del_is_empty.insert(document);
} }
(Values { numbers, strings }, Null) => { Values { numbers, strings } => {
add_is_null.insert(document);
insert_numbers_diff(numbers, vec![])?; insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?; insert_strings_diff(strings, vec![])?;
} }
(Values { numbers, strings }, Empty) => { },
add_is_empty.insert(document); (None, Some(add_filterable_values)) => match add_filterable_values {
insert_numbers_diff(numbers, vec![])?; Null => {
insert_strings_diff(strings, vec![])?; add_is_null.insert(document);
} }
( Empty => {
Values { numbers: del_numbers, strings: del_strings }, add_is_empty.insert(document);
Values { numbers: add_numbers, strings: add_strings }, }
) => { Values { numbers, strings } => {
insert_numbers_diff(del_numbers, add_numbers)?; insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(del_strings, add_strings)?; insert_strings_diff(vec![], strings)?;
}
},
(Some(del_filterable_values), Some(add_filterable_values)) => {
match (del_filterable_values, add_filterable_values) {
(Null, Null) | (Empty, Empty) => (),
(Null, Empty) => {
del_is_null.insert(document);
add_is_empty.insert(document);
}
(Empty, Null) => {
del_is_empty.insert(document);
add_is_null.insert(document);
}
(Null, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_null.insert(document);
}
(Empty, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_empty.insert(document);
}
(Values { numbers, strings }, Null) => {
add_is_null.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(Values { numbers, strings }, Empty) => {
add_is_empty.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(
Values { numbers: del_numbers, strings: del_strings },
Values { numbers: add_numbers, strings: add_strings },
) => {
insert_numbers_diff(del_numbers, add_numbers)?;
insert_strings_diff(del_strings, add_strings)?;
}
} }
} }
} }