From e3c34684c6d9708621ecc4a5f1473f55d1b97119 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 12 Jan 2022 15:22:06 +0100
Subject: [PATCH 1/5] Fix a bug where we were skipping most of the prefix pairs

---
 .../word_prefix_pair_proximity_docids.rs      | 24 +++++++------------
 1 file changed, 9 insertions(+), 15 deletions(-)
diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index eb098a91f..8180cefd4 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -64,28 +64,26 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         );
 
         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
-        let prefix_fst_keys = prefix_fst.into_stream().into_bytes();
-        let prefix_fst_keys: Vec<_> = prefix_fst_keys
-            .as_slice()
-            .linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap())
-            .collect();
+        let prefix_fst_keys = prefix_fst.into_stream().into_strs()?;
+        let prefix_fst_keys: Vec<_> =
+            prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect();
 
         let mut db =
             self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?;
 
         let mut buffer = Vec::new();
-        let mut current_prefixes: Option<&&[Vec<u8>]> = None;
+        let mut current_prefixes: Option<&&[String]> = None;
         let mut prefixes_cache = HashMap::new();
         while let Some(((w1, w2, prox), data)) = db.next().transpose()? {
             current_prefixes = match current_prefixes.take() {
-                Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes),
+                Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes),
                 _otherwise => {
                     write_prefixes_in_sorter(
                         &mut prefixes_cache,
                         &mut word_prefix_pair_proximity_docids_sorter,
                         self.threshold,
                     )?;
-                    prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0]))
+                    prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0]))
                 }
             };
 
@@ -93,9 +91,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
                 buffer.clear();
                 buffer.extend_from_slice(w1.as_bytes());
                 buffer.push(0);
-                for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) {
+                for prefix in prefixes.iter().filter(|prefix| w2.starts_with(prefix.as_str())) {
                     buffer.truncate(w1.len() + 1);
-                    buffer.extend_from_slice(prefix);
+                    buffer.extend_from_slice(prefix.as_bytes());
                     buffer.push(prox);
 
                     match prefixes_cache.get_mut(&buffer) {
@@ -135,17 +133,13 @@ fn write_prefixes_in_sorter(
     sorter: &mut grenad::Sorter<MergeFn>,
     min_word_per_prefix: u32,
 ) -> Result<()> {
-    for (i, (key, data_slices)) in prefixes.drain().enumerate() {
+    for (key, data_slices) in prefixes.drain() {
         // if the number of words prefixed by the prefix is higher than the threshold,
         // we insert it in the sorter.
         if data_slices.len() > min_word_per_prefix as usize {
             for data in data_slices {
                 sorter.insert(&key, data)?;
             }
-        // if the first prefix isn't elligible for insertion,
-        // then the other prefixes can't be elligible.
-        } else if i == 0 {
-            break;
         }
     }
 

From 23ea3ad738f8d1c22f08acd1b01cac3c9aefde45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 12 Jan 2022 15:23:46 +0100
Subject: [PATCH 2/5] Remove the useless threshold when computing the word
 prefix pair proximity

---
 .../word_prefix_pair_proximity_docids.rs      | 24 ++-----------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index 8180cefd4..1227ac08e 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -18,7 +18,6 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,
     pub(crate) max_memory: Option<usize>,
-    threshold: u32,
 }
 
 impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
@@ -33,21 +32,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
             chunk_compression_level: None,
             max_nb_chunks: None,
             max_memory: None,
-            threshold: 100,
         }
     }
 
-    /// Set the number of words required to make a prefix be part of the words prefixes
-    /// database. If a word prefix is supposed to match more than this number of words in the
-    /// dictionnary, therefore this prefix is added to the words prefixes datastructures.
-    ///
-    /// Default value is 100. This value must be higher than 50 and will be clamped
-    /// to these bound otherwise.
-    pub fn threshold(&mut self, value: u32) -> &mut Self {
-        self.threshold = value.max(50);
-        self
-    }
-
     #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
     pub fn execute(self) -> Result<()> {
         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
@@ -81,7 +68,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
                     write_prefixes_in_sorter(
                         &mut prefixes_cache,
                         &mut word_prefix_pair_proximity_docids_sorter,
-                        self.threshold,
                     )?;
                     prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0]))
                 }
@@ -109,7 +95,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         write_prefixes_in_sorter(
             &mut prefixes_cache,
             &mut word_prefix_pair_proximity_docids_sorter,
-            self.threshold,
         )?;
 
         drop(prefix_fst);
@@ -131,15 +116,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
 fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
     sorter: &mut grenad::Sorter<MergeFn>,
-    min_word_per_prefix: u32,
 ) -> Result<()> {
     for (key, data_slices) in prefixes.drain() {
-        // if the number of words prefixed by the prefix is higher than the threshold,
-        // we insert it in the sorter.
-        if data_slices.len() > min_word_per_prefix as usize {
-            for data in data_slices {
-                sorter.insert(&key, data)?;
-            }
+        for data in data_slices {
+            sorter.insert(&key, data)?;
         }
     }
 

From 1514dfa1b7c630657790c4380ac6ae46ebd125f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 12 Jan 2022 15:40:51 +0100
Subject: [PATCH 3/5] Introduce a max proximity parameter to the word prefix
 pair proximity update

---
 .../update/word_prefix_pair_proximity_docids.rs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index 1227ac08e..b177e683d 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -18,6 +18,7 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,
     pub(crate) max_memory: Option<usize>,
+    max_proximity: u8,
 }
 
 impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
@@ -32,9 +33,21 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
             chunk_compression_level: None,
             max_nb_chunks: None,
             max_memory: None,
+            max_proximity: 4,
         }
     }
 
+    /// Set the maximum proximity required to make a prefix be part of the words prefixes
+    /// database. If two words are two far from the threshold the associated documents will
+    /// not be part of the prefix database.
+    ///
+    /// Default value is 4. This value must be lower or equal than 4 and will be clamped
+    /// to this bound otherwise.
+    pub fn max_proximity(&mut self, value: u8) -> &mut Self {
+        self.max_proximity = value.max(7);
+        self
+    }
+
     #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
     pub fn execute(self) -> Result<()> {
         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
@@ -62,6 +75,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         let mut current_prefixes: Option<&&[String]> = None;
         let mut prefixes_cache = HashMap::new();
         while let Some(((w1, w2, prox), data)) = db.next().transpose()? {
+            if prox > self.max_proximity {
+                continue;
+            }
+
             current_prefixes = match current_prefixes.take() {
                 Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes),
                 _otherwise => {

From f04cd198866b49d67887981a2fd8f058aec1bbdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 12 Jan 2022 16:14:53 +0100
Subject: [PATCH 4/5] Introduce a max prefix length parameter to the word
 prefix pair proximity update

---
 .../word_prefix_pair_proximity_docids.rs      | 31 ++++++++++++++-----
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index b177e683d..808a0d8e4 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -19,6 +19,7 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
     pub(crate) max_nb_chunks: Option<usize>,
     pub(crate) max_memory: Option<usize>,
     max_proximity: u8,
+    max_prefix_length: usize,
 }
 
 impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
@@ -34,6 +35,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
             max_nb_chunks: None,
             max_memory: None,
             max_proximity: 4,
+            max_prefix_length: 2,
         }
     }
 
@@ -48,6 +50,17 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         self
     }
 
+    /// Set the maximum length the prefix of a word pair is allowed to have be part of the words
+    /// prefixes database. If two words are two far from the threshold the associated documents
+    /// will not be part of the prefix database.
+    ///
+    /// Default value is 4. This value must be lower or equal than 4 and will be clamped
+    /// to this bound otherwise.
+    pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
+        self.max_prefix_length = value;
+        self
+    }
+
     #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
     pub fn execute(self) -> Result<()> {
         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
@@ -94,15 +107,17 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
                 buffer.clear();
                 buffer.extend_from_slice(w1.as_bytes());
                 buffer.push(0);
-                for prefix in prefixes.iter().filter(|prefix| w2.starts_with(prefix.as_str())) {
-                    buffer.truncate(w1.len() + 1);
-                    buffer.extend_from_slice(prefix.as_bytes());
-                    buffer.push(prox);
+                for prefix in prefixes.iter() {
+                    if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) {
+                        buffer.truncate(w1.len() + 1);
+                        buffer.extend_from_slice(prefix.as_bytes());
+                        buffer.push(prox);
 
-                    match prefixes_cache.get_mut(&buffer) {
-                        Some(value) => value.push(data),
-                        None => {
-                            prefixes_cache.insert(buffer.clone(), vec![data]);
+                        match prefixes_cache.get_mut(&buffer) {
+                            Some(value) => value.push(data),
+                            None => {
+                                prefixes_cache.insert(buffer.clone(), vec![data]);
+                            }
                         }
                     }
                 }

From f9b214f34e6b189877178a7cb6b6cf7f8bfa19d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 26 Jan 2022 11:28:11 +0100
Subject: [PATCH 5/5] Apply suggestions from code review

Co-authored-by: Many <legendre.maxime.isn@gmail.com>
---
 milli/src/update/word_prefix_pair_proximity_docids.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index 808a0d8e4..2dc00fb90 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -40,22 +40,21 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
     }
 
     /// Set the maximum proximity required to make a prefix be part of the words prefixes
-    /// database. If two words are two far from the threshold the associated documents will
+    /// database. If two words are too far from the threshold the associated documents will
     /// not be part of the prefix database.
     ///
-    /// Default value is 4. This value must be lower or equal than 4 and will be clamped
+    /// Default value is 4. This value must be lower or equal than 7 and will be clamped
     /// to this bound otherwise.
     pub fn max_proximity(&mut self, value: u8) -> &mut Self {
         self.max_proximity = value.max(7);
         self
     }
 
-    /// Set the maximum length the prefix of a word pair is allowed to have be part of the words
-    /// prefixes database. If two words are two far from the threshold the associated documents
+    /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
+    /// prefixes database. If the prefix length is higher than the threshold, the associated documents
     /// will not be part of the prefix database.
     ///
-    /// Default value is 4. This value must be lower or equal than 4 and will be clamped
-    /// to this bound otherwise.
+    /// Default value is 2.
     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
         self.max_prefix_length = value;
         self