Use the sorter cache when collection prefix docids

This commit is contained in:
Clément Renault 2024-07-17 17:35:47 +02:00
parent a3beaa90c5
commit 0e08906fcb
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 43 additions and 49 deletions

View File

@ -140,6 +140,10 @@ where
self.sorter.insert(key, value_writer.into_inner().unwrap()) self.sorter.insert(key, value_writer.into_inner().unwrap())
} }
pub fn direct_insert(&mut self, key: &[u8], val: &[u8]) -> Result<(), grenad::Error<U>> {
self.sorter.insert(key, val)
}
pub fn into_sorter(mut self) -> Result<grenad::Sorter<MF>, grenad::Error<U>> { pub fn into_sorter(mut self) -> Result<grenad::Sorter<MF>, grenad::Error<U>> {
let default_lru = LruCache::new(NonZeroUsize::MIN); let default_lru = LruCache::new(NonZeroUsize::MIN);
for (key, deladd) in mem::replace(&mut self.cache, default_lru) { for (key, deladd) in mem::replace(&mut self.cache, default_lru) {
@ -155,10 +159,6 @@ pub struct DelAddRoaringBitmap {
} }
impl DelAddRoaringBitmap { impl DelAddRoaringBitmap {
fn new_del_add(bitmap: RoaringBitmap) -> Self {
DelAddRoaringBitmap { del: Some(bitmap.clone()), add: Some(bitmap) }
}
fn new_del_add_u32(n: u32) -> Self { fn new_del_add_u32(n: u32) -> Self {
DelAddRoaringBitmap { DelAddRoaringBitmap {
del: Some(RoaringBitmap::from([n])), del: Some(RoaringBitmap::from([n])),

View File

@ -1,4 +1,4 @@
mod cache; pub mod cache;
mod enrich; mod enrich;
mod extract; mod extract;
mod helpers; mod helpers;

View File

@ -1,11 +1,13 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::num::NonZeroUsize;
use grenad::CompressionType; use grenad::CompressionType;
use heed::types::{Bytes, Str}; use heed::types::Str;
use heed::Database; use heed::Database;
use super::index_documents::cache::SorterCacheDelAddCboRoaringBitmap;
use super::index_documents::REDIS_CLIENT; use super::index_documents::REDIS_CLIENT;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::del_add::deladd_serialize_add_side;
use crate::update::index_documents::{ use crate::update::index_documents::{
create_sorter, merge_deladd_cbo_roaring_bitmaps, create_sorter, merge_deladd_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
@ -53,11 +55,9 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
common_prefix_fst_words: &[&[String]], common_prefix_fst_words: &[&[String]],
del_prefix_fst_words: &HashSet<Vec<u8>>, del_prefix_fst_words: &HashSet<Vec<u8>>,
) -> Result<()> { ) -> Result<()> {
let mut conn = REDIS_CLIENT.get_connection().unwrap();
// It is forbidden to keep a mutable reference into the database // It is forbidden to keep a mutable reference into the database
// and write into it at the same time, therefore we write into another file. // and write into it at the same time, therefore we write into another file.
let mut prefix_docids_sorter = create_sorter( let prefix_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable, grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
self.chunk_compression_type, self.chunk_compression_type,
@ -65,6 +65,11 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
self.max_nb_chunks, self.max_nb_chunks,
self.max_memory, self.max_memory,
); );
let mut cached_prefix_docids_sorter = SorterCacheDelAddCboRoaringBitmap::<20, MergeFn>::new(
NonZeroUsize::new(200).unwrap(),
prefix_docids_sorter,
REDIS_CLIENT.get_connection().unwrap(),
);
if !common_prefix_fst_words.is_empty() { if !common_prefix_fst_words.is_empty() {
let mut current_prefixes: Option<&&[String]> = None; let mut current_prefixes: Option<&&[String]> = None;
@ -76,8 +81,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
_otherwise => { _otherwise => {
write_prefixes_in_sorter( write_prefixes_in_sorter(
&mut prefixes_cache, &mut prefixes_cache,
&mut prefix_docids_sorter, &mut cached_prefix_docids_sorter,
&mut conn,
)?; )?;
common_prefix_fst_words common_prefix_fst_words
.iter() .iter()
@ -100,22 +104,17 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
} }
} }
write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter, &mut conn)?; write_prefixes_in_sorter(&mut prefixes_cache, &mut cached_prefix_docids_sorter)?;
} }
// We fetch the docids associated to the newly added word prefix fst only. // We fetch the docids associated to the newly added word prefix fst only.
let db = self.word_docids.remap_data_type::<Bytes>(); let db = self.word_docids.lazily_decode_data();
let mut buffer = Vec::new();
for prefix in new_prefix_fst_words { for prefix in new_prefix_fst_words {
let prefix = std::str::from_utf8(prefix.as_bytes())?; let prefix = std::str::from_utf8(prefix.as_bytes())?;
for result in db.prefix_iter(self.wtxn, prefix)? { for result in db.prefix_iter(self.wtxn, prefix)? {
let (_word, data) = result?; let (_word, lazy_data) = result?;
buffer.clear(); cached_prefix_docids_sorter
let mut writer = KvWriterDelAdd::new(&mut buffer); .insert_add(prefix.as_bytes(), lazy_data.decode().unwrap())?;
writer.insert(DelAdd::Addition, data)?;
redis::cmd("INCR").arg(prefix.as_bytes()).query::<usize>(&mut conn).unwrap();
prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
} }
} }
@ -133,7 +132,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
// We finally write the word prefix docids into the LMDB database. // We finally write the word prefix docids into the LMDB database.
write_sorter_into_database( write_sorter_into_database(
prefix_docids_sorter, cached_prefix_docids_sorter.into_sorter()?,
&self.word_prefix_docids, &self.word_prefix_docids,
self.wtxn, self.wtxn,
database_is_empty, database_is_empty,
@ -147,14 +146,12 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
fn write_prefixes_in_sorter( fn write_prefixes_in_sorter(
prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>, prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
sorter: &mut grenad::Sorter<MergeFn>, sorter: &mut SorterCacheDelAddCboRoaringBitmap<20, MergeFn>,
conn: &mut redis::Connection,
) -> Result<()> { ) -> Result<()> {
for (key, data_slices) in prefixes.drain() { for (key, data_slices) in prefixes.drain() {
for data in data_slices { for data in data_slices {
if valid_lmdb_key(&key) { if valid_lmdb_key(&key) {
redis::cmd("INCR").arg(key.as_slice()).query::<usize>(conn).unwrap(); sorter.direct_insert(&key, &data)?;
sorter.insert(&key, data)?;
} }
} }
} }

View File

@ -1,4 +1,5 @@
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::num::NonZeroUsize;
use std::str; use std::str;
use grenad::CompressionType; use grenad::CompressionType;
@ -9,7 +10,8 @@ use tracing::debug;
use crate::error::SerializationError; use crate::error::SerializationError;
use crate::heed_codec::StrBEU16Codec; use crate::heed_codec::StrBEU16Codec;
use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::del_add::deladd_serialize_add_side;
use crate::update::index_documents::cache::SorterCacheDelAddCboRoaringBitmap;
use crate::update::index_documents::{ use crate::update::index_documents::{
create_sorter, merge_deladd_cbo_roaring_bitmaps, create_sorter, merge_deladd_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
@ -59,9 +61,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
) -> Result<()> { ) -> Result<()> {
debug!("Computing and writing the word levels integers docids into LMDB on disk..."); debug!("Computing and writing the word levels integers docids into LMDB on disk...");
let mut conn = REDIS_CLIENT.get_connection().unwrap(); let prefix_integer_docids_sorter = create_sorter(
let mut prefix_integer_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable, grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
self.chunk_compression_type, self.chunk_compression_type,
@ -69,6 +69,12 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
self.max_nb_chunks, self.max_nb_chunks,
self.max_memory, self.max_memory,
); );
let mut cached_prefix_integer_docids_sorter =
SorterCacheDelAddCboRoaringBitmap::<20, MergeFn>::new(
NonZeroUsize::new(200).unwrap(),
prefix_integer_docids_sorter,
REDIS_CLIENT.get_connection().unwrap(),
);
if !common_prefix_fst_words.is_empty() { if !common_prefix_fst_words.is_empty() {
// We fetch all the new common prefixes between the previous and new prefix fst. // We fetch all the new common prefixes between the previous and new prefix fst.
@ -86,8 +92,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
_otherwise => { _otherwise => {
write_prefixes_in_sorter( write_prefixes_in_sorter(
&mut prefixes_cache, &mut prefixes_cache,
&mut prefix_integer_docids_sorter, &mut cached_prefix_integer_docids_sorter,
&mut conn,
)?; )?;
common_prefix_fst_words common_prefix_fst_words
.iter() .iter()
@ -115,14 +120,12 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
write_prefixes_in_sorter( write_prefixes_in_sorter(
&mut prefixes_cache, &mut prefixes_cache,
&mut prefix_integer_docids_sorter, &mut cached_prefix_integer_docids_sorter,
&mut conn,
)?; )?;
} }
// We fetch the docids associated to the newly added word prefix fst only. // We fetch the docids associated to the newly added word prefix fst only.
let db = self.word_database.remap_data_type::<Bytes>(); let db = self.word_database.lazily_decode_data();
let mut buffer = Vec::new();
for prefix_bytes in new_prefix_fst_words { for prefix_bytes in new_prefix_fst_words {
let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
@ -134,16 +137,12 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
.prefix_iter(self.wtxn, prefix_bytes.as_bytes())? .prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
.remap_key_type::<StrBEU16Codec>(); .remap_key_type::<StrBEU16Codec>();
for result in iter { for result in iter {
let ((word, pos), data) = result?; let ((word, pos), lazy_data) = result?;
if word.starts_with(prefix) { if word.starts_with(prefix) {
let key = (prefix, pos); let key = (prefix, pos);
let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
cached_prefix_integer_docids_sorter
buffer.clear(); .insert_add(&bytes, lazy_data.decode().unwrap())?;
let mut writer = KvWriterDelAdd::new(&mut buffer);
writer.insert(DelAdd::Addition, data)?;
redis::cmd("INCR").arg(bytes.as_ref()).query::<usize>(&mut conn).unwrap();
prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
} }
} }
} }
@ -167,7 +166,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
// We finally write all the word prefix integer docids into the LMDB database. // We finally write all the word prefix integer docids into the LMDB database.
write_sorter_into_database( write_sorter_into_database(
prefix_integer_docids_sorter, cached_prefix_integer_docids_sorter.into_sorter()?,
&self.prefix_database, &self.prefix_database,
self.wtxn, self.wtxn,
database_is_empty, database_is_empty,
@ -181,15 +180,13 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
fn write_prefixes_in_sorter( fn write_prefixes_in_sorter(
prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>, prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
sorter: &mut grenad::Sorter<MergeFn>, sorter: &mut SorterCacheDelAddCboRoaringBitmap<20, MergeFn>,
conn: &mut redis::Connection,
) -> Result<()> { ) -> Result<()> {
// TODO: Merge before insertion. // TODO: Merge before insertion.
for (key, data_slices) in prefixes.drain() { for (key, data_slices) in prefixes.drain() {
for data in data_slices { for data in data_slices {
if valid_lmdb_key(&key) { if valid_lmdb_key(&key) {
redis::cmd("INCR").arg(key.as_slice()).query::<usize>(conn).unwrap(); sorter.direct_insert(&key, &data)?;
sorter.insert(&key, data)?;
} }
} }
} }