diff --git a/milli/src/update/new/extract/cache.rs b/milli/src/update/new/extract/cache.rs index 00b4dc46d..19d5ba04a 100644 --- a/milli/src/update/new/extract/cache.rs +++ b/milli/src/update/new/extract/cache.rs @@ -15,6 +15,8 @@ pub struct CboCachedSorter { sorter: Sorter, deladd_buffer: Vec, cbo_buffer: Vec, + total_insertions: usize, + fitted_in_key: usize, } impl CboCachedSorter { @@ -24,6 +26,8 @@ impl CboCachedSorter { sorter, deladd_buffer: Vec::new(), cbo_buffer: Vec::new(), + total_insertions: 0, + fitted_in_key: 0, } } } @@ -35,6 +39,8 @@ impl CboCachedSorter { del.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_del_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -55,6 +61,8 @@ impl CboCachedSorter { del.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_del(bitmap); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -71,6 +79,8 @@ impl CboCachedSorter { add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_add_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -91,6 +101,8 @@ impl CboCachedSorter { add.get_or_insert_with(PushOptimizedBitmap::default).union_with_bitmap(bitmap); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_add(bitmap); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -108,6 +120,8 @@ impl CboCachedSorter { add.get_or_insert_with(PushOptimizedBitmap::default).insert(n); } None => { + self.total_insertions += 1; + self.fitted_in_key += (key.len() <= 20) as usize; let value = DelAddRoaringBitmap::new_del_add_u32(n); if let Some((key, deladd)) = self.cache.push(key.into(), value) { self.write_entry(key, deladd)?; @@ -161,14 +175,22 @@ impl CboCachedSorter { for (key, deladd) in mem::replace(&mut self.cache, default_arc) { self.write_entry(key, deladd)?; } + + tracing::info!( + "LruCache stats: {} <= 20 bytes ({}%) on a total of {} insertions", + self.fitted_in_key, + (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0, + self.total_insertions, + ); + Ok(self.sorter) } } #[derive(Debug, Clone)] pub struct DelAddRoaringBitmap { - pub del: Option, - pub add: Option, + pub(crate) del: Option, + pub(crate) add: Option, } impl DelAddRoaringBitmap { diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index 652964b11..c5c3cd2a2 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -1,26 +1,21 @@ +use std::borrow::Cow; use std::collections::HashMap; -use std::{borrow::Cow, fs::File, num::NonZero}; +use std::fs::File; +use std::num::NonZero; -use grenad::Merger; -use grenad::MergerBuilder; +use grenad::{Merger, MergerBuilder}; use heed::RoTxn; -use rayon::iter::IntoParallelIterator; -use rayon::iter::ParallelIterator; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; -use super::{ - tokenize_document::{tokenizer_builder, DocumentTokenizer}, - SearchableExtractor, -}; +use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; +use super::SearchableExtractor; +use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::perm_json_p::contained_in; -use crate::DocumentId; +use crate::update::new::{DocumentChange, ItemsPool}; +use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::{ - bucketed_position, - update::{ - create_sorter, - new::{extract::cache::CboCachedSorter, DocumentChange, ItemsPool}, - GrenadParameters, MergeDeladdCboRoaringBitmaps, - }, - FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE, + bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result, + MAX_POSITION_PER_ATTRIBUTE, }; const MAX_COUNTED_WORDS: usize = 30; @@ -565,7 +560,7 @@ impl WordDocidsExtractors { cached_sorter: &mut WordDocidsCachedSorters, document_change: DocumentChange, ) -> Result<()> { - let exact_attributes = index.exact_attributes(&rtxn)?; + let exact_attributes = index.exact_attributes(rtxn)?; let is_exact_attribute = |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); let mut buffer = Vec::new(); diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 3a6bb7894..ce8136260 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -59,7 +59,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { DocumentChange::Update(inner) => { let document = inner.current(rtxn, index)?.unwrap(); process_document_tokens( - &document, + document, document_tokenizer, fields_ids_map, &mut word_positions, diff --git a/milli/src/update/new/extract/searchable/tokenize_document.rs b/milli/src/update/new/extract/searchable/tokenize_document.rs index 829bf8a49..d2795114e 100644 --- a/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -92,24 +92,24 @@ impl<'a> DocumentTokenizer<'a> { }; // if the current field is searchable or contains a searchable attribute - if select_field(&field_name, self.attribute_to_extract, self.attribute_to_skip) { + if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) { // parse json. match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { Value::Object(object) => seek_leaf_values_in_object( &object, self.attribute_to_extract, self.attribute_to_skip, - &field_name, + field_name, &mut tokenize_field, )?, Value::Array(array) => seek_leaf_values_in_array( &array, self.attribute_to_extract, self.attribute_to_skip, - &field_name, + field_name, &mut tokenize_field, )?, - value => tokenize_field(&field_name, &value)?, + value => tokenize_field(field_name, &value)?, } } }