diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 0cd55c929..8883cc451 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -35,9 +35,6 @@ pub fn index_prefix_word_database( .filter(|s| s.len() <= max_prefix_length) .collect(); - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB for proximity in 1..=max_proximity - 1 { for prefix in common_prefixes.iter() { let mut prefix_key = vec![]; @@ -135,13 +132,11 @@ pub fn index_prefix_word_database( Ok(()) } -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. /// -/// Its main arguments are: -/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements -/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements -/// -/// For more information about what this function does, read the module documentation. +/// Its arguments are: +/// - an iterator over the words following the given `prefix` with the given `proximity` +/// - a closure to describe how to handle the newly computed (proximity, prefix, word2) elements fn execute_on_word_pairs_and_prefixes( proximity: u8, prefix: &[u8], @@ -151,28 +146,32 @@ fn execute_on_word_pairs_and_prefixes( ) -> Result<()> { let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default(); - while let Some((word2, data)) = next_word2_and_docids(iter)? { + // Memory usage check: + // The content of the loop will be called for each `word2` that follows a word beginning + // with `prefix` with the given proximity. + // In practice, I don't think the batch can ever get too big. + while let Some((word2, docids)) = next_word2_and_docids(iter)? 
{ let entry = batch.entry(word2.to_owned()).or_default(); - entry.push(Cow::Owned(data.to_owned())); + entry.push(Cow::Owned(docids.to_owned())); } - let mut key_buffer = Vec::with_capacity(8); + let mut key_buffer = Vec::with_capacity(512); key_buffer.push(proximity); key_buffer.extend_from_slice(prefix); key_buffer.push(0); let mut value_buffer = Vec::with_capacity(65_536); - for (key, values) in batch { + for (word2, docids) in batch { key_buffer.truncate(prefix.len() + 2); value_buffer.clear(); - key_buffer.extend_from_slice(&key); - let data = if values.len() > 1 { - CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?; + key_buffer.extend_from_slice(&word2); + let data = if docids.len() > 1 { + CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; value_buffer.as_slice() } else { - &values[0] + &docids[0] }; insert(key_buffer.as_slice(), data)?; } diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 1c7a4fffe..eb0b05d89 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -1,5 +1,4 @@ /*! - ## What is WordPrefix? The word-prefix-pair-proximity-docids database is a database whose keys are of the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with @@ -320,7 +319,7 @@ fn execute_on_word_pairs_and_prefixes( let mut merge_buffer = Vec::with_capacity(65_536); while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? { - // skip this iteration if the proximity is over the threshold + // stop indexing if the proximity is over the threshold if proximity > max_proximity { break; };