From d68fe2b3c7600ec6c280693d338cba698cab1f77 Mon Sep 17 00:00:00 2001
From: ad hoc <postma.marin@protonmail.com>
Date: Tue, 15 Mar 2022 15:56:07 +0100
Subject: [PATCH 1/3] optimize word prefix fst: build all prefix lengths in one words fst pass

---
 milli/src/update/words_prefixes_fst.rs | 39 +++++++++++++-------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs
index 49406deb5..0977bc9f0 100644
--- a/milli/src/update/words_prefixes_fst.rs
+++ b/milli/src/update/words_prefixes_fst.rs
@@ -1,7 +1,6 @@
 use std::iter::FromIterator;
-use std::str;
 
-use fst::Streamer;
+use fst::{SetBuilder, Streamer};
 
 use crate::{Index, Result, SmallString32};
 
@@ -44,43 +43,45 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
     pub fn execute(self) -> Result<()> {
         let words_fst = self.index.words_fst(&self.wtxn)?;
 
-        let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
-        for n in 1..=self.max_prefix_length {
-            let mut current_prefix = SmallString32::new();
-            let mut current_prefix_count = 0;
-            let mut builder = fst::SetBuilder::memory();
+        let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length];
+        let mut current_prefix_count = vec![0; self.max_prefix_length];
+        let mut builders: Vec<_> =
+            std::iter::repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect();
+
+        let mut stream = words_fst.stream();
+        while let Some(bytes) = stream.next() {
+            for n in 0..self.max_prefix_length {
+                let current_prefix = &mut current_prefix[n];
+                let current_prefix_count = &mut current_prefix_count[n];
+                let builder = &mut builders[n];
 
-            let mut stream = words_fst.stream();
-            while let Some(bytes) = stream.next() {
                 // We try to get the first n bytes out of this string but we only want
                 // to split at valid characters bounds. If we try to split in the middle of
                 // a character we ignore this word and go to the next one.
-                let word = str::from_utf8(bytes)?;
-                let prefix = match word.get(..n) {
+                let word = std::str::from_utf8(bytes)?;
+                let prefix = match word.get(..=n) {
                     Some(prefix) => prefix,
                     None => continue,
                 };
 
                 // This is the first iteration of the loop,
                 // or the current word doesn't starts with the current prefix.
-                if current_prefix_count == 0 || prefix != current_prefix.as_str() {
-                    current_prefix = SmallString32::from(prefix);
-                    current_prefix_count = 0;
+                if *current_prefix_count == 0 || prefix != current_prefix.as_str() {
+                    *current_prefix = SmallString32::from(prefix);
+                    *current_prefix_count = 0;
                 }
 
-                current_prefix_count += 1;
+                *current_prefix_count += 1;
 
                 // There is enough words corresponding to this prefix to add it to the cache.
-                if current_prefix_count >= self.threshold {
+                if *current_prefix_count >= self.threshold {
                     builder.insert(prefix)?;
                 }
             }
-
-            // We construct the final set for prefixes of size n.
-            prefix_fsts.push(builder.into_set());
         }
 
         // We merge all of the previously computed prefixes into on final set.
+        let prefix_fsts: Vec<_> = builders.into_iter().map(|sb| sb.into_set()).collect();
         let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
         let mut builder = fst::SetBuilder::memory();
         builder.extend_stream(op.r#union())?;

From d633ac5b9d6c7229d50b1eaacbe57ae9cc5d9ae6 Mon Sep 17 00:00:00 2001
From: ad hoc <postma.marin@protonmail.com>
Date: Tue, 15 Mar 2022 16:37:22 +0100
Subject: [PATCH 2/3] optimize word prefix pair: skip deletion scan when del_prefix_fst_words is empty

---
 .../word_prefix_pair_proximity_docids.rs      | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index 284bb8981..be0ddf005 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -155,20 +155,20 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
 
         // All of the word prefix pairs in the database that have a w2
         // that is contained in the `suppr_pw` set must be removed as well.
-        let mut iter = self
-            .index
-            .word_prefix_pair_proximity_docids
-            .remap_data_type::<ByteSlice>()
-            .iter_mut(self.wtxn)?;
-        while let Some(((_, w2, _), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(w2.as_bytes()) {
-                // Delete this entry as the w2 prefix is no more in the words prefix fst.
-                unsafe { iter.del_current()? };
+        if !del_prefix_fst_words.is_empty() {
+            let mut iter = self
+                .index
+                .word_prefix_pair_proximity_docids
+                .remap_data_type::<ByteSlice>()
+                .iter_mut(self.wtxn)?;
+            while let Some(((_, w2, _), _)) = iter.next().transpose()? {
+                if del_prefix_fst_words.contains(w2.as_bytes()) {
+                    // Delete this entry as the w2 prefix is no more in the words prefix fst.
+                    unsafe { iter.del_current()? };
+                }
             }
         }
 
-        drop(iter);
-
         // We finally write and merge the new word prefix pair proximity docids
         // in the LMDB database.
         sorter_into_lmdb_database(

From d127c57f2de034378fca1adec7c622744efbbf28 Mon Sep 17 00:00:00 2001
From: ad hoc <postma.marin@protonmail.com>
Date: Tue, 15 Mar 2022 17:12:48 +0100
Subject: [PATCH 3/3] review edits: import repeat_with and std::str in words_prefixes_fst

---
 milli/src/update/words_prefixes_fst.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs
index 0977bc9f0..95c9f3b01 100644
--- a/milli/src/update/words_prefixes_fst.rs
+++ b/milli/src/update/words_prefixes_fst.rs
@@ -1,4 +1,5 @@
-use std::iter::FromIterator;
+use std::iter::{repeat_with, FromIterator};
+use std::str;
 
 use fst::{SetBuilder, Streamer};
 
@@ -45,8 +46,8 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
 
         let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length];
         let mut current_prefix_count = vec![0; self.max_prefix_length];
-        let mut builders: Vec<_> =
-            std::iter::repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect();
+        let mut builders =
+            repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect::<Vec<_>>();
 
         let mut stream = words_fst.stream();
         while let Some(bytes) = stream.next() {
@@ -58,7 +59,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
                 // We try to get the first n bytes out of this string but we only want
                 // to split at valid characters bounds. If we try to split in the middle of
                 // a character we ignore this word and go to the next one.
-                let word = std::str::from_utf8(bytes)?;
+                let word = str::from_utf8(bytes)?;
                 let prefix = match word.get(..=n) {
                     Some(prefix) => prefix,
                     None => continue,