Measure the SmallVec efficiency

This commit is contained in:
Clément Renault 2024-09-24 15:32:15 +02:00
parent 4ce5d3d66d
commit 7f148c127c
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 42 additions and 25 deletions

View File

@ -15,6 +15,8 @@ pub struct CboCachedSorter<MF> {
sorter: Sorter<MF>, sorter: Sorter<MF>,
deladd_buffer: Vec<u8>, deladd_buffer: Vec<u8>,
cbo_buffer: Vec<u8>, cbo_buffer: Vec<u8>,
total_insertions: usize,
fitted_in_key: usize,
} }
impl<MF> CboCachedSorter<MF> { impl<MF> CboCachedSorter<MF> {
@ -24,6 +26,8 @@ impl<MF> CboCachedSorter<MF> {
sorter, sorter,
deladd_buffer: Vec::new(), deladd_buffer: Vec::new(),
cbo_buffer: Vec::new(), cbo_buffer: Vec::new(),
total_insertions: 0,
fitted_in_key: 0,
} }
} }
} }
@ -35,6 +39,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
del.get_or_insert_with(PushOptimizedBitmap::default).insert(n); del.get_or_insert_with(PushOptimizedBitmap::default).insert(n);
} }
None => { None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_del_u32(n); let value = DelAddRoaringBitmap::new_del_u32(n);
if let Some((key, deladd)) = self.cache.push(key.into(), value) { if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?; self.write_entry(key, deladd)?;
@ -55,6 +61,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
del.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); del.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap);
} }
None => { None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_del(bitmap); let value = DelAddRoaringBitmap::new_del(bitmap);
if let Some((key, deladd)) = self.cache.push(key.into(), value) { if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?; self.write_entry(key, deladd)?;
@ -71,6 +79,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); add.get_or_insert_with(PushOptimizedBitmap::default).insert(n);
} }
None => { None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_add_u32(n); let value = DelAddRoaringBitmap::new_add_u32(n);
if let Some((key, deladd)) = self.cache.push(key.into(), value) { if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?; self.write_entry(key, deladd)?;
@ -91,6 +101,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
add.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); add.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap);
} }
None => { None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_add(bitmap); let value = DelAddRoaringBitmap::new_add(bitmap);
if let Some((key, deladd)) = self.cache.push(key.into(), value) { if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?; self.write_entry(key, deladd)?;
@ -108,6 +120,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); add.get_or_insert_with(PushOptimizedBitmap::default).insert(n);
} }
None => { None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_del_add_u32(n); let value = DelAddRoaringBitmap::new_del_add_u32(n);
if let Some((key, deladd)) = self.cache.push(key.into(), value) { if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?; self.write_entry(key, deladd)?;
@ -161,14 +175,22 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
for (key, deladd) in mem::replace(&mut self.cache, default_arc) { for (key, deladd) in mem::replace(&mut self.cache, default_arc) {
self.write_entry(key, deladd)?; self.write_entry(key, deladd)?;
} }
tracing::info!(
"LruCache stats: {} <= 20 bytes ({}%) on a total of {} insertions",
self.fitted_in_key,
(self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0,
self.total_insertions,
);
Ok(self.sorter) Ok(self.sorter)
} }
} }
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct DelAddRoaringBitmap { pub struct DelAddRoaringBitmap {
pub del: Option<PushOptimizedBitmap>, pub(crate) del: Option<PushOptimizedBitmap>,
pub add: Option<PushOptimizedBitmap>, pub(crate) add: Option<PushOptimizedBitmap>,
} }
impl DelAddRoaringBitmap { impl DelAddRoaringBitmap {

View File

@ -1,26 +1,21 @@
use std::borrow::Cow;
use std::collections::HashMap; use std::collections::HashMap;
use std::{borrow::Cow, fs::File, num::NonZero}; use std::fs::File;
use std::num::NonZero;
use grenad::Merger; use grenad::{Merger, MergerBuilder};
use grenad::MergerBuilder;
use heed::RoTxn; use heed::RoTxn;
use rayon::iter::IntoParallelIterator; use rayon::iter::{IntoParallelIterator, ParallelIterator};
use rayon::iter::ParallelIterator;
use super::{ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
tokenize_document::{tokenizer_builder, DocumentTokenizer}, use super::SearchableExtractor;
SearchableExtractor, use crate::update::new::extract::cache::CboCachedSorter;
};
use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::extract::perm_json_p::contained_in;
use crate::DocumentId; use crate::update::new::{DocumentChange, ItemsPool};
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
use crate::{ use crate::{
bucketed_position, bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result,
update::{ MAX_POSITION_PER_ATTRIBUTE,
create_sorter,
new::{extract::cache::CboCachedSorter, DocumentChange, ItemsPool},
GrenadParameters, MergeDeladdCboRoaringBitmaps,
},
FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE,
}; };
const MAX_COUNTED_WORDS: usize = 30; const MAX_COUNTED_WORDS: usize = 30;
@ -565,7 +560,7 @@ impl WordDocidsExtractors {
cached_sorter: &mut WordDocidsCachedSorters, cached_sorter: &mut WordDocidsCachedSorters,
document_change: DocumentChange, document_change: DocumentChange,
) -> Result<()> { ) -> Result<()> {
let exact_attributes = index.exact_attributes(&rtxn)?; let exact_attributes = index.exact_attributes(rtxn)?;
let is_exact_attribute = let is_exact_attribute =
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
let mut buffer = Vec::new(); let mut buffer = Vec::new();

View File

@ -59,7 +59,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
DocumentChange::Update(inner) => { DocumentChange::Update(inner) => {
let document = inner.current(rtxn, index)?.unwrap(); let document = inner.current(rtxn, index)?.unwrap();
process_document_tokens( process_document_tokens(
&document, document,
document_tokenizer, document_tokenizer,
fields_ids_map, fields_ids_map,
&mut word_positions, &mut word_positions,

View File

@ -92,24 +92,24 @@ impl<'a> DocumentTokenizer<'a> {
}; };
// if the current field is searchable or contains a searchable attribute // if the current field is searchable or contains a searchable attribute
if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) { if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
// parse json. // parse json.
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
Value::Object(object) => seek_leaf_values_in_object( Value::Object(object) => seek_leaf_values_in_object(
&object, &object,
self.attribute_to_extract, self.attribute_to_extract,
self.attribute_to_skip, self.attribute_to_skip,
&field_name, field_name,
&mut tokenize_field, &mut tokenize_field,
)?, )?,
Value::Array(array) => seek_leaf_values_in_array( Value::Array(array) => seek_leaf_values_in_array(
&array, &array,
self.attribute_to_extract, self.attribute_to_extract,
self.attribute_to_skip, self.attribute_to_skip,
&field_name, field_name,
&mut tokenize_field, &mut tokenize_field,
)?, )?,
value => tokenize_field(&field_name, &value)?, value => tokenize_field(field_name, &value)?,
} }
} }
} }