From a983129613d1f9340484e648536fcce4c4f4303d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 20 Oct 2022 09:49:37 +0200 Subject: [PATCH] Apply suggestions from code review --- milli/src/search/criteria/mod.rs | 5 ++-- milli/src/update/index_documents/mod.rs | 8 ++++++- milli/src/update/prefix_word_pairs/mod.rs | 24 +++++++++++++++++-- .../update/prefix_word_pairs/prefix_word.rs | 9 ++++--- .../update/prefix_word_pairs/word_prefix.rs | 7 ++++-- 5 files changed, 42 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 234252ff2..4069306b3 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -448,9 +448,8 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmap |= m, - None => {} + if let Some(m) = ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + bitmap |= m } } if bitmap.is_empty() { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 897f2f8f8..5550c8725 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -528,7 +528,13 @@ where if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { // Run the word prefix pair proximity docids update operation. - PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute( + PrefixWordPairsProximityDocids::new( + self.wtxn, + self.index, + self.indexer_config.chunk_compression_type, + self.indexer_config.chunk_compression_level, + ) + .execute( word_pair_proximity_docids, &new_prefix_fst_words, &common_prefix_fst_words, diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 1549acf40..03abdbb6e 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::io::BufReader; +use grenad::CompressionType; use heed::types::ByteSlice; use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; @@ -18,10 +19,24 @@ pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { index: &'i Index, max_proximity: u8, max_prefix_length: usize, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, } impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self { - Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 } + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, + ) -> Self { + Self { + wtxn, + index, + max_proximity: 4, + max_prefix_length: 2, + chunk_compression_type, + chunk_compression_level, + } } /// Set the maximum proximity required to make a prefix be part of the words prefixes /// database. If two words are too far from the threshold the associated documents will @@ -42,6 +57,7 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { self.max_prefix_length = value; self } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute<'a>( self, @@ -60,6 +76,8 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words, + self.chunk_compression_type, + self.chunk_compression_level, )?; index_prefix_word_database( @@ -72,6 +90,8 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words, + self.chunk_compression_type, + self.chunk_compression_level, )?; Ok(()) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 8883cc451..9bc184825 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -23,6 +23,8 @@ pub fn index_prefix_word_database( new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, ) -> Result<()> { let max_proximity = max_proximity - 1; debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -35,7 +37,7 @@ pub fn index_prefix_word_database( .filter(|s| s.len() <= max_prefix_length) .collect(); - for proximity in 1..=max_proximity - 1 { + for proximity in 1..max_proximity { for prefix in common_prefixes.iter() { let mut prefix_key = vec![]; prefix_key.push(proximity); @@ -78,7 +80,8 @@ pub fn index_prefix_word_database( // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) // element in an intermediary grenad - let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); for proximity in 1..=max_proximity - 1 { for prefix in new_prefixes.iter() { @@ -144,7 +147,7 @@ fn execute_on_word_pairs_and_prefixes( mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, ) -> Result<()> { - let mut batch: BTreeMap, Vec>> = <_>::default(); + let mut batch: BTreeMap, Vec>> = BTreeMap::default(); // Memory usage check: // The content of the loop will be called for each `word2` that follows a word beginning diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index eb0b05d89..5895cdc46 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -187,6 +187,8 @@ pub fn index_word_prefix_database( new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -249,7 +251,8 @@ pub fn index_word_prefix_database( // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) // element in an intermediary grenad - let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); execute_on_word_pairs_and_prefixes( &mut db_iter, @@ -325,7 +328,7 @@ fn execute_on_word_pairs_and_prefixes( }; let word2_start_different_than_prev = word2[0] != prev_word2_start; // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the s`ame letter, then there is also no potential + // and if the current word2 starts with the same letter, then there is also no potential // prefixes for the current word2, and we can skip to the next iteration if empty_prefixes && !word2_start_different_than_prev { continue;