Measure the SmallVec efficacy

This commit is contained in:
Clément Renault 2024-09-24 15:32:15 +02:00
parent 4ce5d3d66d
commit 7f148c127c
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 42 additions and 25 deletions

View File

@ -15,6 +15,8 @@ pub struct CboCachedSorter<MF> {
sorter: Sorter<MF>,
deladd_buffer: Vec<u8>,
cbo_buffer: Vec<u8>,
total_insertions: usize,
fitted_in_key: usize,
}
impl<MF> CboCachedSorter<MF> {
@ -24,6 +26,8 @@ impl<MF> CboCachedSorter<MF> {
sorter,
deladd_buffer: Vec::new(),
cbo_buffer: Vec::new(),
total_insertions: 0,
fitted_in_key: 0,
}
}
}
@ -35,6 +39,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
del.get_or_insert_with(PushOptimizedBitmap::default).insert(n);
}
None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_del_u32(n);
if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?;
@ -55,6 +61,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
del.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap);
}
None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_del(bitmap);
if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?;
@ -71,6 +79,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
add.get_or_insert_with(PushOptimizedBitmap::default).insert(n);
}
None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_add_u32(n);
if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?;
@ -91,6 +101,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
add.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap);
}
None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_add(bitmap);
if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?;
@ -108,6 +120,8 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
add.get_or_insert_with(PushOptimizedBitmap::default).insert(n);
}
None => {
self.total_insertions += 1;
self.fitted_in_key += (key.len() <= 20) as usize;
let value = DelAddRoaringBitmap::new_del_add_u32(n);
if let Some((key, deladd)) = self.cache.push(key.into(), value) {
self.write_entry(key, deladd)?;
@ -161,14 +175,22 @@ impl<MF: MergeFunction> CboCachedSorter<MF> {
for (key, deladd) in mem::replace(&mut self.cache, default_arc) {
self.write_entry(key, deladd)?;
}
tracing::info!(
"LruCache stats: {} <= 20 bytes ({}%) on a total of {} insertions",
self.fitted_in_key,
(self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0,
self.total_insertions,
);
Ok(self.sorter)
}
}
#[derive(Debug, Clone)]
pub struct DelAddRoaringBitmap {
pub del: Option<PushOptimizedBitmap>,
pub add: Option<PushOptimizedBitmap>,
pub(crate) del: Option<PushOptimizedBitmap>,
pub(crate) add: Option<PushOptimizedBitmap>,
}
impl DelAddRoaringBitmap {

View File

@ -1,26 +1,21 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::{borrow::Cow, fs::File, num::NonZero};
use std::fs::File;
use std::num::NonZero;
use grenad::Merger;
use grenad::MergerBuilder;
use grenad::{Merger, MergerBuilder};
use heed::RoTxn;
use rayon::iter::IntoParallelIterator;
use rayon::iter::ParallelIterator;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use super::{
tokenize_document::{tokenizer_builder, DocumentTokenizer},
SearchableExtractor,
};
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::SearchableExtractor;
use crate::update::new::extract::cache::CboCachedSorter;
use crate::update::new::extract::perm_json_p::contained_in;
use crate::DocumentId;
use crate::update::new::{DocumentChange, ItemsPool};
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
use crate::{
bucketed_position,
update::{
create_sorter,
new::{extract::cache::CboCachedSorter, DocumentChange, ItemsPool},
GrenadParameters, MergeDeladdCboRoaringBitmaps,
},
FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE,
bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result,
MAX_POSITION_PER_ATTRIBUTE,
};
const MAX_COUNTED_WORDS: usize = 30;
@ -565,7 +560,7 @@ impl WordDocidsExtractors {
cached_sorter: &mut WordDocidsCachedSorters,
document_change: DocumentChange,
) -> Result<()> {
let exact_attributes = index.exact_attributes(&rtxn)?;
let exact_attributes = index.exact_attributes(rtxn)?;
let is_exact_attribute =
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
let mut buffer = Vec::new();

View File

@ -59,7 +59,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
DocumentChange::Update(inner) => {
let document = inner.current(rtxn, index)?.unwrap();
process_document_tokens(
&document,
document,
document_tokenizer,
fields_ids_map,
&mut word_positions,

View File

@ -92,24 +92,24 @@ impl<'a> DocumentTokenizer<'a> {
};
// if the current field is searchable or contains a searchable attribute
if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) {
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
// parse json.
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
Value::Object(object) => seek_leaf_values_in_object(
&object,
self.attribute_to_extract,
self.attribute_to_skip,
&field_name,
field_name,
&mut tokenize_field,
)?,
Value::Array(array) => seek_leaf_values_in_array(
&array,
self.attribute_to_extract,
self.attribute_to_skip,
&field_name,
field_name,
&mut tokenize_field,
)?,
value => tokenize_field(&field_name, &value)?,
value => tokenize_field(field_name, &value)?,
}
}
}