From 1ae13c137430bc88b9418d382964d362afb4af4e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Mar 2022 18:02:12 +0100 Subject: [PATCH] Avoid iterating on big databases when useless --- milli/src/update/word_prefix_docids.rs | 47 ++++---- .../word_prefix_pair_proximity_docids.rs | 102 +++++++++--------- .../update/words_prefix_position_docids.rs | 60 ++++++----- 3 files changed, 111 insertions(+), 98 deletions(-) diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 2baaf2f19..076816f09 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -50,35 +50,38 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.max_memory, ); - let mut new_word_docids_iter = new_word_docids.into_cursor()?; - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { - current_prefixes = match current_prefixes.take() { - Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; - common_prefix_fst_words - .iter() - .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) - } - }; + if !common_prefix_fst_words.is_empty() { + let mut new_word_docids_iter = new_word_docids.into_cursor()?; + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((word, data)) = new_word_docids_iter.move_on_next()? 
{ + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) + } + }; - if let Some(prefixes) = current_prefixes { - for prefix in prefixes.iter() { - if word.starts_with(prefix.as_bytes()) { - match prefixes_cache.get_mut(prefix.as_bytes()) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(prefix.clone().into(), vec![data.to_owned()]); + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix.as_bytes()) { + match prefixes_cache.get_mut(prefix.as_bytes()) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache + .insert(prefix.clone().into(), vec![data.to_owned()]); + } } } } } } - } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + } // We fetch the docids associated to the newly added word prefix fst only. let db = self.index.word_docids.remap_data_type::(); diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 692dd1568..284bb8981 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -83,70 +83,76 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.max_memory, ); - // We compute the prefix docids associated with the common prefixes between - // the old and new word prefix fst. - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_wppd_iter.move_on_next()? 
{ - let (w1, w2, prox) = StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - if prox > self.max_proximity { - continue; + if !common_prefix_fst_words.is_empty() { + // We compute the prefix docids associated with the common prefixes between + // the old and new word prefix fst. + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = new_wppd_iter.move_on_next()? { + let (w1, w2, prox) = + StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + if prox > self.max_proximity { + continue; + } + + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + common_prefix_fst_words, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, + write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - common_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, )?; } - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; + if !new_prefix_fst_words.is_empty() { + // We compute the prefix docids associated with the newly added prefixes + // in the new word prefix fst. + let mut db_iter = self + .index + .word_pair_proximity_docids + .remap_data_type::() + .iter(self.wtxn)?; - // We compute the prefix docids associated with the newly added prefixes - // in the new word prefix fst. - let mut db_iter = - self.index.word_pair_proximity_docids.remap_data_type::().iter(self.wtxn)?; + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? 
{ + if prox > self.max_proximity { + continue; + } - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? { - if prox > self.max_proximity { - continue; + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + &new_prefix_fst_words, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, + write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - &new_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, )?; } - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; - - drop(db_iter); - // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. let mut iter = self diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 324516325..77e9e7c29 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -74,42 +74,46 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; - // We fetch all the new common prefixes between the previous and new prefix fst. - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { - let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + if !common_prefix_fst_words.is_empty() { + // We fetch all the new common prefixes between the previous and new prefix fst. 
+ let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { + let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - current_prefixes = match current_prefixes.take() { - Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut prefix_position_docids_sorter, - )?; - common_prefix_fst_words.iter().find(|prefixes| word.starts_with(&prefixes[0])) - } - }; + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut prefix_position_docids_sorter, + )?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(&prefixes[0])) + } + }; - if let Some(prefixes) = current_prefixes { - for prefix in prefixes.iter() { - if word.starts_with(prefix) { - buffer.clear(); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.extend_from_slice(&pos.to_be_bytes()); - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix) { + buffer.clear(); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.extend_from_slice(&pos.to_be_bytes()); + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } } } } } } - } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + } // We fetch the docids associated to the newly added word prefix fst only. 
        let db = self.index.word_position_docids.remap_data_type::<ByteSlice>();