From 1ae13c137430bc88b9418d382964d362afb4af4e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Mar 2022 18:02:12 +0100 Subject: [PATCH] Avoid iterating on big databases when useless --- milli/src/update/word_prefix_docids.rs | 47 ++++---- .../word_prefix_pair_proximity_docids.rs | 102 +++++++++--------- .../update/words_prefix_position_docids.rs | 60 ++++++----- 3 files changed, 111 insertions(+), 98 deletions(-) diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 2baaf2f19..076816f09 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -50,35 +50,38 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.max_memory, ); - let mut new_word_docids_iter = new_word_docids.into_cursor()?; - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { - current_prefixes = match current_prefixes.take() { - Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; - common_prefix_fst_words - .iter() - .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) - } - }; + if !common_prefix_fst_words.is_empty() { + let mut new_word_docids_iter = new_word_docids.into_cursor()?; + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((word, data)) = new_word_docids_iter.move_on_next()? 
{ + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) + } + }; - if let Some(prefixes) = current_prefixes { - for prefix in prefixes.iter() { - if word.starts_with(prefix.as_bytes()) { - match prefixes_cache.get_mut(prefix.as_bytes()) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(prefix.clone().into(), vec![data.to_owned()]); + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix.as_bytes()) { + match prefixes_cache.get_mut(prefix.as_bytes()) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache + .insert(prefix.clone().into(), vec![data.to_owned()]); + } } } } } } - } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + } // We fetch the docids associated to the newly added word prefix fst only. let db = self.index.word_docids.remap_data_type::(); diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 692dd1568..284bb8981 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -83,70 +83,76 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.max_memory, ); - // We compute the prefix docids associated with the common prefixes between - // the old and new word prefix fst. - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_wppd_iter.move_on_next()? 
{ - let (w1, w2, prox) = StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - if prox > self.max_proximity { - continue; + if !common_prefix_fst_words.is_empty() { + // We compute the prefix docids associated with the common prefixes between + // the old and new word prefix fst. + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = new_wppd_iter.move_on_next()? { + let (w1, w2, prox) = + StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + if prox > self.max_proximity { + continue; + } + + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + common_prefix_fst_words, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, + write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - common_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, )?; } - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; + if !new_prefix_fst_words.is_empty() { + // We compute the prefix docids associated with the newly added prefixes + // in the new word prefix fst. + let mut db_iter = self + .index + .word_pair_proximity_docids + .remap_data_type::() + .iter(self.wtxn)?; - // We compute the prefix docids associated with the newly added prefixes - // in the new word prefix fst. - let mut db_iter = - self.index.word_pair_proximity_docids.remap_data_type::().iter(self.wtxn)?; + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? 
{ + if prox > self.max_proximity { + continue; + } - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? { - if prox > self.max_proximity { - continue; + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + &new_prefix_fst_words, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, + write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - &new_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, )?; } - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; - - drop(db_iter); - // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. let mut iter = self diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 324516325..77e9e7c29 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -74,42 +74,46 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; - // We fetch all the new common prefixes between the previous and new prefix fst. - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { - let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + if !common_prefix_fst_words.is_empty() { + // We fetch all the new common prefixes between the previous and new prefix fst. 
+ let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { + let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - current_prefixes = match current_prefixes.take() { - Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut prefix_position_docids_sorter, - )?; - common_prefix_fst_words.iter().find(|prefixes| word.starts_with(&prefixes[0])) - } - }; + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut prefix_position_docids_sorter, + )?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(&prefixes[0])) + } + }; - if let Some(prefixes) = current_prefixes { - for prefix in prefixes.iter() { - if word.starts_with(prefix) { - buffer.clear(); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.extend_from_slice(&pos.to_be_bytes()); - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix) { + buffer.clear(); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.extend_from_slice(&pos.to_be_bytes()); + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } } } } } } - } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + } // We fetch the docids associated to the newly added word prefix fst only. 
        let db = self.index.word_position_docids.remap_data_type::<ByteSlice>();