From 8d0ace2d64aa37558c354249e6104e7d407f3a1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Thu, 24 Nov 2022 09:00:53 +0100
Subject: [PATCH] Avoid creating a MatchingWord for words that exceed the
 length limit

---
 milli/src/lib.rs                              | 15 ++++
 milli/src/search/matches/matching_words.rs    | 25 ++++--
 milli/src/search/matches/mod.rs               | 22 ++---
 milli/src/search/query_tree.rs                | 82 +++++++++++++------
 .../extract/extract_docid_word_positions.rs   |  8 +-
 .../extract/extract_facet_string_docids.rs    |  3 +-
 .../extract/extract_fid_docid_facet_values.rs |  3 +-
 .../src/update/index_documents/helpers/mod.rs | 15 +---
 8 files changed, 111 insertions(+), 62 deletions(-)
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index c33aae9eb..40e36092f 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -70,6 +70,21 @@ pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>;
 /// expressed in term of latitude and longitude.
 pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>;
 
+/// The maximum length a LMDB key can be.
+///
+/// Note that the actual allowed length is a little bit higher, but
+/// we keep a margin of safety.
+const MAX_LMDB_KEY_LENGTH: usize = 500;
+
+/// The maximum length a field value can be when inserted in an LMDB key.
+///
+/// This number is determined by the keys of the different facet databases
+/// and adding a margin of safety.
+pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
+
+/// The maximum length a word can be
+pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
+
 pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;
 
 // Convert an absolute word position into a relative position.
diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs
index 5bd6c222d..22ba973b5 100644
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@@ -8,6 +8,7 @@ use charabia::Token;
 use levenshtein_automata::{Distance, DFA};
 
 use crate::search::build_dfa;
+use crate::MAX_WORD_LENGTH;
 
 type IsPrefix = bool;
 
@@ -18,6 +19,17 @@ pub struct MatchingWords {
     inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
 }
 
+impl fmt::Debug for MatchingWords {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "[")?;
+        for (matching_words, primitive_word_id) in self.inner.iter() {
+            writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
+        }
+        writeln!(f, "]")?;
+        Ok(())
+    }
+}
+
 impl MatchingWords {
     pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
         // Sort word by len in DESC order prioritizing the longuest matches,
@@ -93,10 +105,13 @@ impl PartialEq for MatchingWord {
 }
 
 impl MatchingWord {
-    pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
+    pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
+        if word.len() > MAX_WORD_LENGTH {
+            return None;
+        }
         let dfa = build_dfa(&word, typo, prefix);
 
-        Self { dfa, word, typo, prefix }
+        Some(Self { dfa, word, typo, prefix })
     }
 
     /// Returns the lenght in chars of the match in case of the token matches the term.
@@ -335,9 +350,9 @@ mod tests {
     #[test]
     fn matching_words() {
         let all = vec![
-            Rc::new(MatchingWord::new("split".to_string(), 1, true)),
-            Rc::new(MatchingWord::new("this".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("world".to_string(), 1, true)),
+            Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
+            Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
         ];
         let matching_words = vec![
             (vec![all[0].clone()], vec![0]),
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 0e515fde6..25ee52ab1 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -503,9 +503,9 @@ mod tests {
 
     fn matching_words() -> MatchingWords {
         let all = vec![
-            Rc::new(MatchingWord::new("split".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("the".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("world".to_string(), 1, true)),
+            Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
         ];
         let matching_words = vec![
             (vec![all[0].clone()], vec![0]),
@@ -595,8 +595,8 @@ mod tests {
     #[test]
     fn highlight_unicode() {
         let all = vec![
-            Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)),
-            Rc::new(MatchingWord::new("world".to_string(), 1, true)),
+            Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
+            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
         ];
         let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
 
@@ -832,12 +832,12 @@ mod tests {
     #[test]
     fn partial_matches() {
         let all = vec![
-            Rc::new(MatchingWord::new("the".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("t".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("he".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("door".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("do".to_string(), 0, false)),
-            Rc::new(MatchingWord::new("or".to_string(), 0, false)),
+            Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
+            Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
         ];
         let matching_words = vec![
             (vec![all[0].clone()], vec![0]),
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index acb326022..74b244f9a 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -550,21 +550,20 @@ struct MatchingWordCache {
     map: HashMap<(String, u8, bool), Rc<MatchingWord>>,
 }
 impl MatchingWordCache {
-    fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Rc<MatchingWord> {
-        // Toggle the (un)commented code to switch between cached and non-cached
-        // implementations.
-
-        // self.all.push(MatchingWord::new(word, typo, prefix));
-        // self.all.len() - 1
+    fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option<Rc<MatchingWord>> {
         match self.map.entry((word.clone(), typo, prefix)) {
-            Entry::Occupied(idx) => idx.get().clone(),
+            Entry::Occupied(idx) => Some(idx.get().clone()),
             Entry::Vacant(vacant) => {
-                let matching_word = Rc::new(MatchingWord::new(word, typo, prefix));
+                let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
                 self.all.push(matching_word.clone());
                 vacant.insert(matching_word.clone());
-                matching_word
+                Some(matching_word)
             }
         }
+        // To deactivate the cache, for testing purposes, use the following instead:
+        // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
+        // self.all.push(matching_word.clone());
+        // Some(matching_word)
     }
 }
 
@@ -591,16 +590,19 @@ fn create_matching_words(
                     for synonym in synonyms {
                         let synonym = synonym
                             .into_iter()
-                            .map(|syn| matching_word_cache.insert(syn, 0, false))
+                            .flat_map(|syn| matching_word_cache.insert(syn, 0, false))
                             .collect();
                         matching_words.push((synonym, vec![id]));
                     }
                 }
 
                 if let Some((left, right)) = split_best_frequency(ctx, &word)? {
-                    let left = matching_word_cache.insert(left.to_string(), 0, false);
-                    let right = matching_word_cache.insert(right.to_string(), 0, false);
-                    matching_words.push((vec![left, right], vec![id]));
+                    if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) {
+                        if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false)
+                        {
+                            matching_words.push((vec![left, right], vec![id]));
+                        }
+                    }
                 }
 
                 let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
@@ -614,7 +616,9 @@ fn create_matching_words(
                         matching_word_cache.insert(word, typo, prefix)
                     }
                 };
-                matching_words.push((vec![matching_word], vec![id]));
+                if let Some(matching_word) = matching_word {
+                    matching_words.push((vec![matching_word], vec![id]));
+                }
             }
             // create a CONSECUTIVE matchings words wrapping all word in the phrase
             PrimitiveQueryPart::Phrase(words) => {
@@ -623,7 +627,7 @@ fn create_matching_words(
                 let words = words
                     .into_iter()
                     .flatten()
-                    .map(|w| matching_word_cache.insert(w, 0, false))
+                    .flat_map(|w| matching_word_cache.insert(w, 0, false))
                     .collect();
                 matching_words.push((words, ids));
             }
@@ -681,7 +685,7 @@ fn create_matching_words(
                                 for synonym in synonyms {
                                     let synonym = synonym
                                         .into_iter()
-                                        .map(|syn| matching_word_cache.insert(syn, 0, false))
+                                        .flat_map(|syn| matching_word_cache.insert(syn, 0, false))
                                         .collect();
                                     matching_words.push((synonym, ids.clone()));
                                 }
@@ -704,7 +708,9 @@ fn create_matching_words(
                                     matching_word_cache.insert(word, typo, is_prefix)
                                 }
                             };
-                            matching_words.push((vec![matching_word], ids));
+                            if let Some(matching_word) = matching_word {
+                                matching_words.push((vec![matching_word], ids));
+                            }
                         }
                     }
 
@@ -1341,6 +1347,27 @@ mod test {
         );
     }
 
+    #[test]
+    fn test_dont_create_matching_word_for_long_words() {
+        let index = TempIndex::new();
+        let rtxn = index.read_txn().unwrap();
+        let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house";
+        let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
+        builder.words_limit(10);
+        let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap();
+        insta::assert_snapshot!(format!("{matching_words:?}"), @r###"
+        [
+        ([MatchingWord { word: "house", typo: 1, prefix: true }], [3])
+        ([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
+        ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1])
+        ([MatchingWord { word: "house", typo: 1, prefix: true }], [2])
+        ([MatchingWord { word: "house", typo: 1, prefix: true }], [1])
+        ([MatchingWord { word: "what", typo: 0, prefix: false }], [0])
+        ([MatchingWord { word: "a", typo: 0, prefix: false }], [1])
+        ]
+        "###);
+    }
+
     #[test]
     fn disable_typo_on_word() {
         let query = "goodbye";
@@ -1380,9 +1407,8 @@ mod test {
         }
     }
 
-    // This test must be run
     #[test]
-    fn ten_words() {
+    fn memory_usage_of_ten_word_query() {
         let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst);
         let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst);
 
@@ -1395,12 +1421,20 @@ mod test {
         let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst);
         let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst);
 
-        insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4521710");
-        insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7259092");
+        // Weak check on the memory usage
+        // Don't keep more than 5MB. (Arguably 5MB is already too high)
+        assert!(resident_after - resident_before < 5_000_000);
+        // Don't allocate more than 10MB.
+        assert!(allocated_after - allocated_before < 10_000_000);
 
-        // Note, if the matching word cache is deactivated, the memory usage is:
-        // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91311265");
-        // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125948410");
+        // Use these snapshots to measure the exact memory usage.
+        // The values below were correct at the time I wrote them.
+        // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950");
+        // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502");
+
+        // Note, with the matching word cache deactivated, the memory usage was:
+        // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697");
+        // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588");
         // or about 20x more resident memory (90MB vs 4.5MB)
 
         // Use x
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 8eae0caee..be9b479bb 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -7,11 +7,11 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
-use super::helpers::{
-    concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH,
-};
+use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
-use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
+use crate::{
+    absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
+};
 
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index 3a0af3c96..0d9c0981e 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -6,9 +6,8 @@ use heed::BytesEncode;
 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
 use crate::heed_codec::StrRefCodec;
-use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
 use crate::update::index_documents::merge_cbo_roaring_bitmaps;
-use crate::{FieldId, Result};
+use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 
 /// Extracts the facet string and the documents ids where this facet string appear.
 ///
diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index b37cd90d3..0a7dfbeb1 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -12,9 +12,8 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
-use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH;
 use crate::update::index_documents::{create_writer, writer_into_reader};
-use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
 
 /// Extracts the facet values of each faceted field of each document.
 ///
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index e1f112858..a496ccd6e 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -18,20 +18,7 @@ pub use merge_functions::{
     serialize_roaring_bitmap, MergeFn,
 };
 
-/// The maximum length a LMDB key can be.
-///
-/// Note that the actual allowed length is a little bit higher, but
-/// we keep a margin of safety.
-const MAX_LMDB_KEY_LENGTH: usize = 500;
-
-/// The maximum length a field value can be when inserted in an LMDB key.
-///
-/// This number is determined by the keys of the different facet databases
-/// and adding a margin of safety.
-pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
-
-/// The maximum length a word can be
-pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
+use crate::MAX_WORD_LENGTH;
 
 pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
     key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()