Reduce incremental indexing time of words_prefix_position_docids DB

This database can easily contain millions of entries, so iterating over it can be very expensive. For regular `documentAdditionOrUpdate` tasks, `del_prefix_fst_words` will always be empty, which means the deletion loop can never remove anything. We can therefore save a significant amount of time by wrapping that loop in an `if !del_prefix_fst_words.is_empty()` condition. The code's behaviour remains completely unchanged.
2024-11-23 02:27:40 +08:00 · 2023-01-31 11:42:24 +01:00 · 2023-01-31 11:42:24 +01:00 · a2690ea8d4
commit a2690ea8d4
parent 33f61d2cd4
1 changed files with 11 additions and 7 deletions
--- a/milli/src/update/words_prefix_position_docids.rs
+++ b/milli/src/update/words_prefix_position_docids.rs
@@ -140,16 +140,20 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
        // We remove all the entries that are no more required in this word prefix position
        // docids database.
-        let mut iter =
-            self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
-        while let Some(((prefix, _), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(prefix.as_bytes()) {
-                unsafe { iter.del_current()? };
+        // We also avoid iterating over the whole `word_prefix_position_docids` database if we know in
+        // advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below
+        // will always be false (i.e. if `del_prefix_fst_words` is empty).
+        if !del_prefix_fst_words.is_empty() {
+            let mut iter =
+                self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
+            while let Some(((prefix, _), _)) = iter.next().transpose()? {
+                if del_prefix_fst_words.contains(prefix.as_bytes()) {
+                    unsafe { iter.del_current()? };
+                }
            }
+            drop(iter);
        }
-        drop(iter);
        // We finally write all the word prefix position docids into the LMDB database.
        sorter_into_lmdb_database(
            self.wtxn,