Compute word integer prefix cache

This commit is contained in:
ManyTheFish 2023-11-09 11:34:26 +01:00
parent 70ce40828c
commit 5a9c96e1db

View File

@ -9,9 +9,11 @@ use log::debug;
use crate::error::SerializationError;
use crate::heed_codec::StrBEU16Codec;
use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
CursorClonableMmap, MergeFn,
create_sorter, merge_deladd_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
write_sorter_into_database, CursorClonableMmap, MergeFn,
};
use crate::{CboRoaringBitmapCodec, Result};
@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
let mut prefix_integer_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
self.chunk_compression_type,
self.chunk_compression_level,
self.max_nb_chunks,
@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
// We fetch the docids associated to the newly added word prefix fst only.
let db = self.word_database.remap_data_type::<ByteSlice>();
let mut buffer = Vec::new();
for prefix_bytes in new_prefix_fst_words {
let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
if word.starts_with(prefix) {
let key = (prefix, pos);
let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
prefix_integer_docids_sorter.insert(bytes, data)?;
buffer.clear();
let mut writer = KvWriterDelAdd::new(&mut buffer);
writer.insert(DelAdd::Addition, data)?;
prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
}
}
}
@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
drop(iter);
}
let database_is_empty = self.prefix_database.is_empty(self.wtxn)?;
// We finally write all the word prefix integer docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,
*self.prefix_database.as_polymorph(),
write_sorter_into_database(
prefix_integer_docids_sorter,
merge_cbo_roaring_bitmaps,
&self.prefix_database,
self.wtxn,
database_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
Ok(())
@ -159,6 +170,7 @@ fn write_prefixes_in_sorter(
prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
// TODO: Merge before insertion.
for (key, data_slices) in prefixes.drain() {
for data in data_slices {
if valid_lmdb_key(&key) {