Optimise WordPrefixPairProximityDocIds merge operation

This commit is contained in:
Loïc Lecrenier 2022-07-14 11:53:21 +02:00
parent d350114159
commit 044356d221

View File

@ -8,12 +8,11 @@ use std::borrow::Cow;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::HashSet; use std::collections::HashSet;
use std::io::BufReader; use std::io::BufReader;
use std::time::Instant;
use crate::update::index_documents::{ use crate::update::index_documents::{
create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap,
}; };
use crate::{Index, Result, UncheckedStrStrU8Codec}; use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec};
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -189,6 +188,7 @@ fn execute_on_word_pairs_and_prefixes<Iter>(
let mut empty_prefixes = false; let mut empty_prefixes = false;
let mut prefix_buffer = allocations.take_byte_vector(); let mut prefix_buffer = allocations.take_byte_vector();
let mut merge_buffer = allocations.take_byte_vector();
while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? {
if proximity > max_proximity { if proximity > max_proximity {
@ -200,7 +200,7 @@ fn execute_on_word_pairs_and_prefixes<Iter>(
} }
let word1_different_than_prev = word1 != batch.word1; let word1_different_than_prev = word1 != batch.word1;
if word1_different_than_prev || word2_start_different_than_prev { if word1_different_than_prev || word2_start_different_than_prev {
batch.flush(allocations, &mut insert)?; batch.flush(allocations, &mut merge_buffer, &mut insert)?;
if word1_different_than_prev { if word1_different_than_prev {
prefix_search_start.0 = 0; prefix_search_start.0 = 0;
batch.word1.clear(); batch.word1.clear();
@ -231,7 +231,7 @@ fn execute_on_word_pairs_and_prefixes<Iter>(
prefix_buffer.clear(); prefix_buffer.clear();
} }
} }
batch.flush(allocations, &mut insert)?; batch.flush(allocations, &mut merge_buffer, &mut insert)?;
Ok(()) Ok(())
} }
/** /**
@ -307,12 +307,14 @@ impl PrefixAndProximityBatch {
fn flush( fn flush(
&mut self, &mut self,
allocations: &mut Allocations, allocations: &mut Allocations,
merge_buffer: &mut Vec<u8>,
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
) -> Result<()> { ) -> Result<()> {
let PrefixAndProximityBatch { word1, batch } = self; let PrefixAndProximityBatch { word1, batch } = self;
if batch.is_empty() { if batch.is_empty() {
return Ok(()); return Ok(());
} }
merge_buffer.clear();
let mut buffer = allocations.take_byte_vector(); let mut buffer = allocations.take_byte_vector();
buffer.extend_from_slice(word1); buffer.extend_from_slice(word1);
@ -321,14 +323,15 @@ impl PrefixAndProximityBatch {
for (key, mergeable_data) in batch.drain(..) { for (key, mergeable_data) in batch.drain(..) {
buffer.truncate(word1.len() + 1); buffer.truncate(word1.len() + 1);
buffer.extend_from_slice(key.as_slice()); buffer.extend_from_slice(key.as_slice());
let merged;
let data = if mergeable_data.len() > 1 { let data = if mergeable_data.len() > 1 {
merged = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?;
&merged merge_buffer.as_slice()
} else { } else {
&mergeable_data[0] &mergeable_data[0]
}; };
insert(buffer.as_slice(), data)?; insert(buffer.as_slice(), data)?;
merge_buffer.clear();
allocations.reclaim_byte_vector(key); allocations.reclaim_byte_vector(key);
allocations.reclaim_mergeable_data_vector(mergeable_data); allocations.reclaim_mergeable_data_vector(mergeable_data);
} }
@ -443,20 +446,17 @@ impl PrefixTrieNode {
let byte = word[0]; let byte = word[0];
if self.children[search_start.0].1 == byte { if self.children[search_start.0].1 == byte {
return true; return true;
} else if let Some(position) = } else {
self.children[search_start.0..].iter().position(|(_, c)| *c >= byte) match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) {
{ Ok(position) => {
let (_, c) = self.children[search_start.0 + position];
if c == byte {
search_start.0 += position; search_start.0 += position;
true true
} else { }
Err(_) => {
search_start.0 = 0; search_start.0 = 0;
false false
} }
} else { }
search_start.0 = 0;
false
} }
} }