From dbba5fd461337cc9ab3b9ba82e9a3bb64c444d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 25 Jan 2022 11:34:56 +0100 Subject: [PATCH] Create a function to simplify the word prefix pair proximity docids compute --- .../word_prefix_pair_proximity_docids.rs | 132 ++++++++++-------- 1 file changed, 72 insertions(+), 60 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index dcc5db614..f846e8d9e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -115,36 +115,18 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { continue; } - current_prefixes = match current_prefixes.take() { - Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; - common_prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) - } - }; - - if let Some(prefixes) = current_prefixes { - buffer.clear(); - buffer.extend_from_slice(w1.as_bytes()); - buffer.push(0); - for prefix in prefixes.iter() { - if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); - - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); - } - } - } - } - } + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + &common_prefix_fst_keys, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } write_prefixes_in_sorter( @@ -165,36 +147,18 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { continue; } - current_prefixes = match current_prefixes.take() { - Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; - new_prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) - } - }; - - if let Some(prefixes) = current_prefixes { - buffer.clear(); - buffer.extend_from_slice(w1.as_bytes()); - buffer.push(0); - for prefix in prefixes.iter() { - if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); - - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); - } - } - } - } - } + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + &new_prefix_fst_keys, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } write_prefixes_in_sorter( @@ -247,3 +211,51 @@ fn write_prefixes_in_sorter( Ok(()) } + +/// Computes the current prefix based on the previous and the currently iterated value +/// i.e. w1, w2, prox. It also makes sure to follow the `max_prefix_length` setting. +/// +/// Uses the current prefixes values to insert the associated data i.e. RoaringBitmap, +/// into the sorter that will, later, be inserted in the LMDB database. +fn insert_current_prefix_data_in_sorter<'a>( + buffer: &mut Vec, + current_prefixes: &mut Option<&'a &'a [String]>, + prefixes_cache: &mut HashMap, Vec>>, + word_prefix_pair_proximity_docids_sorter: &mut grenad::Sorter, + prefix_fst_keys: &'a [&'a [std::string::String]], + max_prefix_length: usize, + w1: &str, + w2: &str, + prox: u8, + data: &[u8], +) -> Result<()> { + *current_prefixes = match current_prefixes.take() { + Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter(prefixes_cache, word_prefix_pair_proximity_docids_sorter)?; + prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) + } + }; + + if let Some(prefixes) = current_prefixes { + buffer.clear(); + buffer.extend_from_slice(w1.as_bytes()); + buffer.push(0); + for prefix in prefixes.iter() { + if prefix.len() <= max_prefix_length && w2.starts_with(prefix) { + buffer.truncate(w1.len() + 1); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.push(prox); + + match prefixes_cache.get_mut(buffer.as_slice()) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } + } + } + } + } + + Ok(()) +}