712: Fix bulk facet indexing bug r=Kerollmops a=loiclec

# Pull Request

## Related issue
Fixes (partially, until merged into meilisearch) https://github.com/meilisearch/meilisearch/issues/3165

## What does this PR do?
Fixes a bug where indexing certain numbers of filterable attribute values in bulk led to corrupted facet databases. This was due to a lossy integer conversion which would ultimately prevent entire levels of the facet database to be written into LMDB.

More specifically, this change was made:
```diff
      - if cur_writer_len as u8 >= self.min_level_size {
      + if cur_writer_len >= self.min_level_size as usize {
```
I also checked other comparisons to `min_level_size` and other conversions such as `x as u8` in this part of the codebase.



Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
bors[bot] 2022-11-30 16:51:48 +00:00 committed by GitHub
commit e1612fcb01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 2 deletions

View File

@ -288,6 +288,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
for bitmap in sub_bitmaps { for bitmap in sub_bitmaps {
combined_bitmap |= bitmap; combined_bitmap |= bitmap;
} }
// The conversion of sub_bitmaps.len() to a u8 will always be correct
// since its length is bounded by max_group_size, which is a u8.
group_sizes.push(sub_bitmaps.len() as u8); group_sizes.push(sub_bitmaps.len() as u8);
left_bounds.push(left_bound); left_bounds.push(left_bound);
@ -340,7 +342,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
} }
} }
// if we inserted enough elements to reach the minimum level size, then we push the writer // if we inserted enough elements to reach the minimum level size, then we push the writer
if cur_writer_len as u8 >= self.min_level_size { if cur_writer_len >= self.min_level_size as usize {
sub_writers.push(writer_into_reader(cur_writer)?); sub_writers.push(writer_into_reader(cur_writer)?);
} else { } else {
// otherwise, if there are still leftover elements, we give them to the level above // otherwise, if there are still leftover elements, we give them to the level above
@ -357,11 +359,15 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
mod tests { mod tests {
use std::iter::once; use std::iter::once;
use big_s::S;
use maplit::hashset;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::documents::documents_batch_reader_from_objects;
use crate::heed_codec::facet::OrderedF64Codec; use crate::heed_codec::facet::OrderedF64Codec;
use crate::milli_snap; use crate::index::tests::TempIndex;
use crate::update::facet::tests::FacetIndex; use crate::update::facet::tests::FacetIndex;
use crate::{db_snap, milli_snap};
#[test] #[test]
fn insert() { fn insert() {
@ -443,4 +449,46 @@ mod tests {
test("large_group_small_min_level", 16, 2); test("large_group_small_min_level", 16, 2);
test("odd_group_odd_min_level", 7, 3); test("odd_group_odd_min_level", 7, 3);
} }
#[test]
fn bug_3165() {
// Indexing a number of facet values that falls within certains ranges (e.g. 22_540 qualifies)
// would lead to a facet DB which was missing some levels.
// That was because before writing a level into the database, we would
// check that its size was higher than the minimum level size using
// a lossy integer conversion: `level_size as u8 >= min_level_size`.
//
// This missing level in the facet DBs would make the incremental indexer
// (and other search algorithms) crash.
//
// https://github.com/meilisearch/meilisearch/issues/3165
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings.set_filterable_fields(hashset! { S("id") });
})
.unwrap();
let mut documents = vec![];
for i in 0..=22_540 {
documents.push(
serde_json::json! {
{
"id": i as u64,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
}
} }

View File

@ -436,6 +436,8 @@ impl FacetsUpdateIncrementalInner {
level: highest_level + 1, level: highest_level + 1,
left_bound: first_key.unwrap().left_bound, left_bound: first_key.unwrap().left_bound,
}; };
// Note: nbr_leftover_elements can be casted to a u8 since it is bounded by `max_group_size`
// when it is created above.
let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values };
to_add.push((key.into_owned(), value)); to_add.push((key.into_owned(), value));
} }