From f04cd198866b49d67887981a2fd8f058aec1bbdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 12 Jan 2022 16:14:53 +0100 Subject: [PATCH] Introduce a max prefix length parameter to the word prefix pair proximity update --- .../word_prefix_pair_proximity_docids.rs | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index b177e683d..808a0d8e4 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -19,6 +19,7 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, max_proximity: u8, + max_prefix_length: usize, } impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -34,6 +35,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { max_nb_chunks: None, max_memory: None, max_proximity: 4, + max_prefix_length: 2, } } @@ -48,6 +50,17 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self } + /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words + /// prefixes database. If the prefix is longer than this threshold, the associated documents + /// will not be part of the prefix database. + /// + /// Default value is 2. The value is stored as given (no clamping is applied) and is compared + /// against the byte length of each prefix. 
+ pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value; + self + } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -94,15 +107,17 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { buffer.clear(); buffer.extend_from_slice(w1.as_bytes()); buffer.push(0); - for prefix in prefixes.iter().filter(|prefix| w2.starts_with(prefix.as_str())) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); + for prefix in prefixes.iter() { + if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) { + buffer.truncate(w1.len() + 1); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.push(prox); - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data), - None => { - prefixes_cache.insert(buffer.clone(), vec![data]); + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data), + None => { + prefixes_cache.insert(buffer.clone(), vec![data]); + } } } }