From 8d0ace2d64aa37558c354249e6104e7d407f3a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 24 Nov 2022 09:00:53 +0100 Subject: [PATCH] Avoid creating a MatchingWord for words that exceed the length limit --- milli/src/lib.rs | 15 ++++ milli/src/search/matches/matching_words.rs | 25 ++++-- milli/src/search/matches/mod.rs | 22 ++--- milli/src/search/query_tree.rs | 82 +++++++++++++------ .../extract/extract_docid_word_positions.rs | 8 +- .../extract/extract_facet_string_docids.rs | 3 +- .../extract/extract_fid_docid_facet_values.rs | 3 +- .../src/update/index_documents/helpers/mod.rs | 15 +--- 8 files changed, 111 insertions(+), 62 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index c33aae9eb..40e36092f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -70,6 +70,21 @@ pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; /// expressed in term of latitude and longitude. pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>; +/// The maximum length a LMDB key can be. +/// +/// Note that the actual allowed length is a little bit higher, but +/// we keep a margin of safety. +const MAX_LMDB_KEY_LENGTH: usize = 500; + +/// The maximum length a field value can be when inserted in an LMDB key. +/// +/// This number is determined by the keys of the different facet databases +/// and adding a margin of safety. +pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; + +/// The maximum length a word can be +pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; + pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; // Convert an absolute word position into a relative position. diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 5bd6c222d..22ba973b5 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -8,6 +8,7 @@ use charabia::Token; use levenshtein_automata::{Distance, DFA}; use crate::search::build_dfa; +use crate::MAX_WORD_LENGTH; type IsPrefix = bool; @@ -18,6 +19,17 @@ pub struct MatchingWords { inner: Vec<(Vec>, Vec)>, } +impl fmt::Debug for MatchingWords { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "[")?; + for (matching_words, primitive_word_id) in self.inner.iter() { + writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?; + } + writeln!(f, "]")?; + Ok(()) + } +} + impl MatchingWords { pub fn new(mut matching_words: Vec<(Vec>, Vec)>) -> Self { // Sort word by len in DESC order prioritizing the longuest matches, @@ -93,10 +105,13 @@ impl PartialEq for MatchingWord { } impl MatchingWord { - pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self { + pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option { + if word.len() > MAX_WORD_LENGTH { + return None; + } let dfa = build_dfa(&word, typo, prefix); - Self { dfa, word, typo, prefix } + Some(Self { dfa, word, typo, prefix }) } /// Returns the lenght in chars of the match in case of the token matches the term. @@ -335,9 +350,9 @@ mod tests { #[test] fn matching_words() { let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 1, true)), - Rc::new(MatchingWord::new("this".to_string(), 0, false)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 0e515fde6..25ee52ab1 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -503,9 +503,9 @@ mod tests { fn matching_words() -> MatchingWords { let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 0, false)), - Rc::new(MatchingWord::new("the".to_string(), 0, false)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), @@ -595,8 +595,8 @@ mod tests { #[test] fn highlight_unicode() { let all = vec![ - Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; @@ -832,12 +832,12 @@ mod tests { #[test] fn partial_matches() { let all = vec![ - Rc::new(MatchingWord::new("the".to_string(), 0, false)), - Rc::new(MatchingWord::new("t".to_string(), 0, false)), - Rc::new(MatchingWord::new("he".to_string(), 0, false)), - Rc::new(MatchingWord::new("door".to_string(), 0, false)), - Rc::new(MatchingWord::new("do".to_string(), 0, false)), - Rc::new(MatchingWord::new("or".to_string(), 0, false)), + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index acb326022..74b244f9a 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -550,21 +550,20 @@ struct MatchingWordCache { map: HashMap<(String, u8, bool), Rc>, } impl MatchingWordCache { - fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Rc { - // Toggle the (un)commented code to switch between cached and non-cached - // implementations. - - // self.all.push(MatchingWord::new(word, typo, prefix)); - // self.all.len() - 1 + fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option> { match self.map.entry((word.clone(), typo, prefix)) { - Entry::Occupied(idx) => idx.get().clone(), + Entry::Occupied(idx) => Some(idx.get().clone()), Entry::Vacant(vacant) => { - let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)); + let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); self.all.push(matching_word.clone()); vacant.insert(matching_word.clone()); - matching_word + Some(matching_word) } } + // To deactivate the cache, for testing purposes, use the following instead: + // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); + // self.all.push(matching_word.clone()); + // Some(matching_word) } } @@ -591,16 +590,19 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) + .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, vec![id])); } } if let Some((left, right)) = split_best_frequency(ctx, &word)? { - let left = matching_word_cache.insert(left.to_string(), 0, false); - let right = matching_word_cache.insert(right.to_string(), 0, false); - matching_words.push((vec![left, right], vec![id])); + if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { + if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) + { + matching_words.push((vec![left, right], vec![id])); + } + } } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; @@ -614,7 +616,9 @@ fn create_matching_words( matching_word_cache.insert(word, typo, prefix) } }; - matching_words.push((vec![matching_word], vec![id])); + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], vec![id])); + } } // create a CONSECUTIVE matchings words wrapping all word in the phrase PrimitiveQueryPart::Phrase(words) => { @@ -623,7 +627,7 @@ fn create_matching_words( let words = words .into_iter() .flatten() - .map(|w| matching_word_cache.insert(w, 0, false)) + .flat_map(|w| matching_word_cache.insert(w, 0, false)) .collect(); matching_words.push((words, ids)); } @@ -681,7 +685,7 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) + .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, ids.clone())); } @@ -704,7 +708,9 @@ fn create_matching_words( matching_word_cache.insert(word, typo, is_prefix) } }; - matching_words.push((vec![matching_word], ids)); + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], ids)); + } } } @@ -1341,6 +1347,27 @@ mod test { ); } + #[test] + fn test_dont_create_matching_word_for_long_words() { + let index = TempIndex::new(); + let rtxn = index.read_txn().unwrap(); + let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house"; + let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + builder.words_limit(10); + let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); + insta::assert_snapshot!(format!("{matching_words:?}"), @r###" + [ + ([MatchingWord { word: "house", typo: 1, prefix: true }], [3]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [1]) + ([MatchingWord { word: "what", typo: 0, prefix: false }], [0]) + ([MatchingWord { word: "a", typo: 0, prefix: false }], [1]) + ] + "###); + } + #[test] fn disable_typo_on_word() { let query = "goodbye"; @@ -1380,9 +1407,8 @@ mod test { } } - // This test must be run #[test] - fn ten_words() { + fn memory_usage_of_ten_word_query() { let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); @@ -1395,12 +1421,20 @@ mod test { let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); - insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4521710"); - insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7259092"); + // Weak check on the memory usage + // Don't keep more than 5MB. (Arguably 5MB is already too high) + assert!(resident_after - resident_before < 5_000_000); + // Don't allocate more than 10MB. + assert!(allocated_after - allocated_before < 10_000_000); - // Note, if the matching word cache is deactivated, the memory usage is: - // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91311265"); - // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125948410"); + // Use these snapshots to measure the exact memory usage. + // The values below were correct at the time I wrote them. + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); + + // Note, with the matching word cache deactivated, the memory usage was: + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); // or about 20x more resident memory (90MB vs 4.5MB) // Use x diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 8eae0caee..be9b479bb 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -7,11 +7,11 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{ - concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH, -}; +use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{ + absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, +}; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 3a0af3c96..0d9c0981e 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -6,9 +6,8 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::merge_cbo_roaring_bitmaps; -use crate::{FieldId, Result}; +use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. /// diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index b37cd90d3..0a7dfbeb1 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -12,9 +12,8 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::{create_writer, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet values of each faceted field of each document. /// diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index e1f112858..a496ccd6e 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,20 +18,7 @@ pub use merge_functions::{ serialize_roaring_bitmap, MergeFn, }; -/// The maximum length a LMDB key can be. -/// -/// Note that the actual allowed length is a little bit higher, but -/// we keep a margin of safety. -const MAX_LMDB_KEY_LENGTH: usize = 500; - -/// The maximum length a field value can be when inserted in an LMDB key. -/// -/// This number is determined by the keys of the different facet databases -/// and adding a margin of safety. -pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; - -/// The maximum length a word can be -pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; +use crate::MAX_WORD_LENGTH; pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()