From d68fe2b3c7600ec6c280693d338cba698cab1f77 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 15 Mar 2022 15:56:07 +0100 Subject: [PATCH 1/3] optimize word prefix fst --- milli/src/update/words_prefixes_fst.rs | 39 +++++++++++++------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 49406deb5..0977bc9f0 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -1,7 +1,6 @@ use std::iter::FromIterator; -use std::str; -use fst::Streamer; +use fst::{SetBuilder, Streamer}; use crate::{Index, Result, SmallString32}; @@ -44,43 +43,45 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; - let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); - for n in 1..=self.max_prefix_length { - let mut current_prefix = SmallString32::new(); - let mut current_prefix_count = 0; - let mut builder = fst::SetBuilder::memory(); + let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; + let mut current_prefix_count = vec![0; self.max_prefix_length]; + let mut builders: Vec<_> = + std::iter::repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect(); + + let mut stream = words_fst.stream(); + while let Some(bytes) = stream.next() { + for n in 0..self.max_prefix_length { + let current_prefix = &mut current_prefix[n]; + let current_prefix_count = &mut current_prefix_count[n]; + let builder = &mut builders[n]; - let mut stream = words_fst.stream(); - while let Some(bytes) = stream.next() { // We try to get the first n bytes out of this string but we only want // to split at valid characters bounds. If we try to split in the middle of // a character we ignore this word and go to the next one. 
- let word = str::from_utf8(bytes)?; - let prefix = match word.get(..n) { + let word = std::str::from_utf8(bytes)?; + let prefix = match word.get(..=n) { Some(prefix) => prefix, None => continue, }; // This is the first iteration of the loop, // or the current word doesn't starts with the current prefix. - if current_prefix_count == 0 || prefix != current_prefix.as_str() { - current_prefix = SmallString32::from(prefix); - current_prefix_count = 0; + if *current_prefix_count == 0 || prefix != current_prefix.as_str() { + *current_prefix = SmallString32::from(prefix); + *current_prefix_count = 0; } - current_prefix_count += 1; + *current_prefix_count += 1; // There is enough words corresponding to this prefix to add it to the cache. - if current_prefix_count >= self.threshold { + if *current_prefix_count >= self.threshold { builder.insert(prefix)?; } } - - // We construct the final set for prefixes of size n. - prefix_fsts.push(builder.into_set()); } // We merge all of the previously computed prefixes into on final set. 
+ let prefix_fsts: Vec<_> = builders.into_iter().map(|sb| sb.into_set()).collect(); let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); let mut builder = fst::SetBuilder::memory(); builder.extend_stream(op.r#union())?; From d633ac5b9d6c7229d50b1eaacbe57ae9cc5d9ae6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 15 Mar 2022 16:37:22 +0100 Subject: [PATCH 2/3] optimize word prefix pair --- .../word_prefix_pair_proximity_docids.rs | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 284bb8981..be0ddf005 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -155,20 +155,20 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. - let mut iter = self - .index - .word_prefix_pair_proximity_docids - .remap_data_type::() - .iter_mut(self.wtxn)?; - while let Some(((_, w2, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(w2.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; + if !del_prefix_fst_words.is_empty() { + let mut iter = self + .index + .word_prefix_pair_proximity_docids + .remap_data_type::() + .iter_mut(self.wtxn)?; + while let Some(((_, w2, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(w2.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } } } - drop(iter); - // We finally write and merge the new word prefix pair proximity docids // in the LMDB database. 
sorter_into_lmdb_database( From d127c57f2de034378fca1adec7c622744efbbf28 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 15 Mar 2022 17:12:48 +0100 Subject: [PATCH 3/3] review edits --- milli/src/update/words_prefixes_fst.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 0977bc9f0..95c9f3b01 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -1,4 +1,5 @@ -use std::iter::FromIterator; +use std::iter::{repeat_with, FromIterator}; +use std::str; use fst::{SetBuilder, Streamer}; @@ -45,8 +46,8 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; let mut current_prefix_count = vec![0; self.max_prefix_length]; - let mut builders: Vec<_> = - std::iter::repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect(); + let mut builders = + repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect::<Vec<_>>(); let mut stream = words_fst.stream(); while let Some(bytes) = stream.next() { @@ -58,7 +59,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { // We try to get the first n bytes out of this string but we only want // to split at valid characters bounds. If we try to split in the middle of // a character we ignore this word and go to the next one. - let word = std::str::from_utf8(bytes)?; + let word = str::from_utf8(bytes)?; let prefix = match word.get(..=n) { Some(prefix) => prefix, None => continue,