From d95d02cb8a54937b71a8d80cf76a2257b7017041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 14 Nov 2022 14:16:14 +0100 Subject: [PATCH 1/4] Fix Facet Indexing bugs 1. Handle keys with variable length correctly This fixes https://github.com/meilisearch/meilisearch/issues/3042 and is easily reproducible with the updated fuzz tests, which now generate keys with variable lengths. 2. Prevent adding facets to the database if their encoded value does not satisfy `valid_lmdb_key`. This fixes an indexing failure when a document had a filterable attribute containing a value whose length is higher than ~500 bytes. --- milli/src/update/facet/bulk.rs | 8 +- milli/src/update/facet/incremental.rs | 302 ++++-------------- .../extract/extract_facet_number_docids.rs | 1 - .../extract/extract_facet_string_docids.rs | 9 +- 4 files changed, 66 insertions(+), 254 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 317a7af9b..30660d5af 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -12,7 +12,7 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::heed_codec::ByteSliceRefCodec; -use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases @@ -142,6 +142,9 @@ impl FacetsUpdateBulkInner { let mut database = self.db.iter_mut(wtxn)?.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ + if !valid_lmdb_key(key) { + continue; + } buffer.clear(); // the group size for level 0 buffer.push(1); @@ -155,6 +158,9 @@ impl FacetsUpdateBulkInner { let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + if !valid_lmdb_key(key) { + continue; + } // the value is a CboRoaringBitmap, but I still need to prepend the // group size for level 0 (= 1) to it buffer.clear(); diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index fd253b146..b07b675c5 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -11,6 +11,7 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; +use crate::update::index_documents::valid_lmdb_key; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; enum InsertionResult { @@ -70,6 +71,9 @@ impl<'i> FacetsUpdateIncremental<'i> { let mut cursor = self.new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + if !valid_lmdb_key(key) { + continue; + } let key = FacetGroupKeyCodec::::bytes_decode(key) .ok_or(heed::Error::Encoding)?; let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; @@ -114,52 +118,37 @@ impl FacetsUpdateIncrementalInner { txn: &RoTxn, ) -> Result<(FacetGroupKey>, FacetGroupValue)> { assert!(level > 0); + match self.db.get_lower_than_or_equal_to( + txn, + &FacetGroupKey { field_id, level, left_bound: facet_value }, + )? 
{ + Some((key, value)) => { + if key.level != level { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); - let mut prefix = vec![]; - prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(level); - prefix.extend_from_slice(facet_value); - - let mut prefix_iter = self - .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?; - if let Some(e) = prefix_iter.next() { - let (key_bytes, value) = e?; - Ok(( - FacetGroupKeyCodec::::bytes_decode(key_bytes) - .ok_or(Error::Encoding)? - .into_owned(), - value, - )) - } else { - let key = FacetGroupKey { field_id, level, left_bound: facet_value }; - match self.db.get_lower_than(txn, &key)? { - Some((key, value)) => { - if key.level != level { - let mut prefix = vec![]; - prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(level); - - let mut iter = self - .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - txn, - prefix.as_slice(), - )?; - let (key_bytes, value) = iter.next().unwrap()?; - Ok(( - FacetGroupKeyCodec::::bytes_decode(key_bytes) - .ok_or(Error::Encoding)? - .into_owned(), - value, - )) - } else { - Ok((key.into_owned(), value)) - } + let mut iter = + self.db.as_polymorph().prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix.as_slice(), + )?; + let (key_bytes, value) = iter.next().unwrap()?; + Ok(( + FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + Ok((key.into_owned(), value)) } - None => panic!(), + } + None => { + // We checked that the level is > 0 + // Since all keys of level 1 are greater than those of level 0, + // we are guaranteed that db.get_lower_than_or_equal_to(key) exists + panic!() } } } @@ -1050,9 +1039,7 @@ ensures that: 2. 
its content is the same as a trivially correct implementation of the same database */ mod fuzz { - use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; - use std::convert::TryFrom; use std::iter::FromIterator; use std::rc::Rc; @@ -1060,49 +1047,30 @@ mod fuzz { use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator}; use fuzzcheck::mutators::vector::VecMutator; use fuzzcheck::DefaultMutator; - use heed::BytesEncode; use roaring::RoaringBitmap; use tempfile::TempDir; use super::*; use crate::update::facet::tests::FacetIndex; - - struct NEU16Codec; - impl<'a> BytesEncode<'a> for NEU16Codec { - type EItem = u16; - #[no_coverage] - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Owned(item.to_be_bytes().to_vec())) - } - } - impl<'a> BytesDecode<'a> for NEU16Codec { - type DItem = u16; - #[no_coverage] - fn bytes_decode(bytes: &'a [u8]) -> Option { - let bytes = <[u8; 2]>::try_from(&bytes[0..=1]).unwrap(); - Some(u16::from_be_bytes(bytes)) - } - } - #[derive(Default)] pub struct TrivialDatabase { pub elements: BTreeMap>, } impl TrivialDatabase where - T: Ord + Clone + Copy + Eq + std::fmt::Debug, + T: Ord + Clone + Eq + std::fmt::Debug, { #[no_coverage] - pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { + pub fn insert(&mut self, field_id: u16, new_key: &T, new_values: &RoaringBitmap) { if new_values.is_empty() { return; } let values_field_id = self.elements.entry(field_id).or_default(); - let values = values_field_id.entry(new_key).or_default(); + let values = values_field_id.entry(new_key.clone()).or_default(); *values |= new_values; } #[no_coverage] - pub fn delete(&mut self, field_id: u16, key: T, values_to_remove: &RoaringBitmap) { + pub fn delete(&mut self, field_id: u16, key: &T, values_to_remove: &RoaringBitmap) { if let Some(values_field_id) = self.elements.get_mut(&field_id) { if let Some(values) = values_field_id.get_mut(&key) { *values -= values_to_remove; @@ 
-1117,8 +1085,9 @@ mod fuzz { } } #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] - struct Operation { - key: Key, + struct Operation { + #[field_mutator(VecMutator = { VecMutator::new(u8::default_mutator(), 0 ..= 5) })] + key: Vec, #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] group_size: u8, #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] @@ -1142,13 +1111,12 @@ mod fuzz { } #[no_coverage] - fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { - let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten - // let mut txn = index.env.write_txn().unwrap(); + fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { + let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten let mut txn = index.env.write_txn().unwrap(); - let mut trivial_db = TrivialDatabase::::default(); - let mut value_to_keys = HashMap::>::new(); + let mut trivial_db = TrivialDatabase::>::default(); + let mut value_to_keys = HashMap::>>::new(); for Operation { key, group_size, max_group_size, min_level_size, field_id, kind } in operations { @@ -1160,10 +1128,10 @@ mod fuzz { let mut bitmap = RoaringBitmap::new(); for value in values { bitmap.insert(*value as u32); - value_to_keys.entry(*value).or_default().push(*key); + value_to_keys.entry(*value).or_default().push(key.clone()); } - index.insert(&mut txn, *field_id, key, &bitmap); - trivial_db.insert(*field_id, *key, &bitmap); + index.insert(&mut txn, *field_id, &key.as_slice(), &bitmap); + trivial_db.insert(*field_id, &key, &bitmap); } OperationKind::Delete(values) => { let values = RoaringBitmap::from_iter(values.iter().copied().map(|x| x as u32)); @@ -1179,8 +1147,8 @@ mod fuzz { } } for (key, values) in values_per_key { - index.delete(&mut txn, *field_id, &key, &values); - trivial_db.delete(*field_id, *key, &values); + index.delete(&mut 
txn, *field_id, &key.as_slice(), &values); + trivial_db.delete(*field_id, &key, &values); } } } @@ -1198,7 +1166,8 @@ mod fuzz { for ((key, values), group) in values_field_id.iter().zip(level0iter) { let (group_key, group_values) = group.unwrap(); - let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + let group_key = + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); assert_eq!(key, &group_key.left_bound); assert_eq!(values, &group_values.bitmap); } @@ -1213,7 +1182,8 @@ mod fuzz { for ((key, values), group) in values_field_id.iter().zip(level0iter) { let (group_key, group_values) = group.unwrap(); - let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + let group_key = + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); assert_eq!(key, &group_key.left_bound); assert_eq!(values, &group_values.bitmap); } @@ -1227,7 +1197,7 @@ mod fuzz { fn fuzz() { let tempdir = Rc::new(TempDir::new().unwrap()); let tempdir_cloned = tempdir.clone(); - let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { + let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { compare_with_trivial_database(tempdir_cloned.clone(), operations) }) .default_mutator() @@ -1243,168 +1213,4 @@ mod fuzz { .launch(); assert!(!result.found_test_failure); } - - #[test] - #[no_coverage] - fn reproduce_bug1() { - let operations = r#" - [ - {"key":0, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, - {"key":143, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[243]}}, - {"key":90, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[217]}}, - {"key":172, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[94]}}, - {"key":27, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[4]}}, - {"key":124, "field_id": 0, "group_size":4, 
"max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, - {"key":123, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, - {"key":67, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, - {"key":13, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, - {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[213]}}, - {"key":235, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, - {"key":251, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[50]}}, - {"key":218, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[164]}}, - {"key":166, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, - {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[61]}}, - {"key":183, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[50]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug2() { - let operations = r#" - [ - {"key":102, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[122]}}, - {"key":73, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[132]}}, - {"key":20, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[215]}}, - {"key":39, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[152]}}, - {"key":151, 
"field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[226]}}, - {"key":17, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[101]}}, - {"key":74, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":2, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[130]}}, - {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[180]}}, - {"key":83, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[250]}}, - {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":113, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[63]}}, - {"key":201, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":200, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, - {"key":93, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[98]}}, - {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, - {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[210]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - #[test] - #[no_coverage] - fn reproduce_bug3() { - let operations = r#" - [ - {"key":27488, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, - {"key":64716, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[216]}}, - {"key":60886, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, 
"kind":{"Insert":[206]}}, - {"key":59509, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[187,231]}}, - {"key":55057, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, - {"key":45200, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, - {"key":55056, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, - {"key":63679, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, - {"key":52155, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[74]}}, - {"key":20648, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[47,138,157]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug4() { - let operations = r#"[ - {"key":63499, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[87]}}, - {"key":25374, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[14]}}, - {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":[87]}}, - {"key":23038, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[173]}}, - {"key":14862, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[8]}}, - {"key":13145, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[5,64]}}, - {"key":23446, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[86,59]}}, - {"key":17972, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[58,137]}}, - 
{"key":21273, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[121,132,81,147]}}, - {"key":28264, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[36]}}, - {"key":46659, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug5() { - let input = r#" - [ - { - "key":3438, - "group_size":11, - "max_group_size":0, - "min_level_size":17, - "field_id":3, - "kind":{"Insert":[198]} - }, - - { - "key":47098, - "group_size":0, - "max_group_size":8, - "min_level_size":0, - "field_id":3, - "kind":{"Insert":[11]} - }, - { - "key":22453, - "group_size":0, - "max_group_size":0, - "min_level_size":0, - "field_id":3, - "kind":{"Insert":[145]} - }, - { - "key":14105, - "group_size":14, - "max_group_size":4, - "min_level_size":25, - "field_id":3, - "kind":{"Delete":[11]} - } - ] - "#; - let operations: Vec> = serde_json::from_str(input).unwrap(); - let tmpdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tmpdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug6() { - let input = r#" - [ - {"key":45720,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[120]}}, - {"key":37463,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[187]}}, - {"key":21512,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, - {"key":21511,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, - {"key":37737,"group_size":12,"max_group_size":0,"min_level_size":6,"field_id":0,"kind":{"Insert":[181]}}, - 
{"key":53042,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}} - ] - "#; - let operations: Vec> = serde_json::from_str(input).unwrap(); - let tmpdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tmpdir), &operations); - } } diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 1d415166d..33dd5ce5b 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -38,7 +38,6 @@ pub fn extract_facet_number_docids( let key = FacetGroupKey { field_id, level: 0, left_bound: number }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 182538683..8b02a6008 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -6,7 +6,7 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; +use crate::update::index_documents::{merge_cbo_roaring_bitmaps, valid_lmdb_key}; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. 
@@ -41,9 +41,10 @@ pub fn extract_facet_string_docids( let normalised_value = std::str::from_utf8(normalized_value_bytes)?; let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - - // document id is encoded in native-endian because of the CBO roaring bitmap codec - facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; + if valid_lmdb_key(&key_bytes) { + // document id is encoded in native-endian because of the CBO roaring bitmap codec + facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; + } } sorter_into_reader(facet_string_docids_sorter, indexer) From 990a8612413cd225c5088d86557e574fa1077089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 16 Nov 2022 15:19:55 +0100 Subject: [PATCH 2/4] Add test for indexing a document with a long facet value --- milli/src/update/index_documents/mod.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e9f5c2d38..af99a230b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1821,4 +1821,24 @@ mod tests { let words_fst = index.words_fst(&rtxn).unwrap(); assert!(!words_fst.contains(&long_word)); } + + #[test] + fn long_facet_values_must_not_crash() { + let index = TempIndex::new(); + + // this is obviously too long + let long_word = "lol".repeat(1000); + let doc1 = documents! {[{ + "id": "1", + "title": long_word, + }]}; + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!
{ S("title") }); + }) + .unwrap(); + + index.add_documents(doc1).unwrap(); + } } From ac3baafbe85914a4020eae06f43f6b8394eff5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 16 Nov 2022 14:03:27 +0100 Subject: [PATCH 3/4] Truncate facet values that are too long before indexing them --- .../extract/extract_facet_string_docids.rs | 21 +++++++++++++------ .../extract/extract_fid_docid_facet_values.rs | 11 ++++++++-- .../src/update/index_documents/helpers/mod.rs | 14 ++++++++++++- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 8b02a6008..3a0af3c96 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -6,7 +6,8 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::{merge_cbo_roaring_bitmaps, valid_lmdb_key}; +use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. 
@@ -38,13 +39,21 @@ pub fn extract_facet_string_docids( try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); - let normalised_value = std::str::from_utf8(normalized_value_bytes)?; + let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; + + let normalised_truncated_value: String; + if normalised_value.len() > MAX_FACET_VALUE_LENGTH { + normalised_truncated_value = normalised_value + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + normalised_value = normalised_truncated_value.as_str(); + } let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - if valid_lmdb_key(&key_bytes) { - // document id is encoded in native-endian because of the CBO roaring bitmap codec - facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; - } + // document id is encoded in native-endian because of the CBO roaring bitmap codec + facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 44afcde6c..b37cd90d3 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -12,6 +12,7 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; +use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; @@ -85,10 +86,16 @@ pub 
fn extract_fid_docid_facet_values( } } - // insert normalized and original facet string in sorter + // insert normalized and original facet string in sorter for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { + let normalised_truncated_value: String = normalized + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalized.as_bytes()); + key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; } } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 8fb629cae..e1f112858 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,8 +18,20 @@ pub use merge_functions::{ serialize_roaring_bitmap, MergeFn, }; +/// The maximum length a LMDB key can be. +/// +/// Note that the actual allowed length is a little bit higher, but +/// we keep a margin of safety. +const MAX_LMDB_KEY_LENGTH: usize = 500; + +/// The maximum length a field value can be when inserted in an LMDB key. +/// +/// This number is determined by the keys of the different facet databases +/// and adding a margin of safety. 
+pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; + /// The maximum length a word can be -pub const MAX_WORD_LENGTH: usize = 250; +pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() From 0caadedd3b9e74d18905715f74afc6a5b0bd4544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 17 Nov 2022 12:17:53 +0100 Subject: [PATCH 4/4] Make clippy happy --- milli/src/update/facet/incremental.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index b07b675c5..ddf55b06c 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -131,11 +131,11 @@ impl FacetsUpdateIncrementalInner { let mut iter = self.db.as_polymorph().prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( txn, - &prefix.as_slice(), + prefix.as_slice(), )?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(key_bytes) .ok_or(Error::Encoding)? .into_owned(), value,