Reduce the number of caches created by using thread_local

This commit is contained in:
ManyTheFish 2024-10-07 15:58:16 +02:00
parent 58d96fbea3
commit c11b7e5c0f
5 changed files with 68 additions and 41 deletions

5
Cargo.lock generated
View File

@ -3598,6 +3598,7 @@ dependencies = [
"smartstring",
"tempfile",
"thiserror",
"thread_local",
"tiktoken-rs",
"time",
"tokenizers",
@ -5332,9 +5333,9 @@ dependencies = [
[[package]]
name = "thread_local"
version = "1.1.7"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
dependencies = [
"cfg-if",
"once_cell",

View File

@ -89,6 +89,7 @@ ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2"
rayon-par-bridge = "0.1.0"
hashbrown = "0.14.5"
thread_local = "1.1.8"
[dev-dependencies]
mimalloc = { version = "0.1.43", default-features = false }

View File

@ -1,3 +1,4 @@
use std::cell::RefCell;
use std::collections::HashSet;
use std::fmt::Debug;
use std::fs::File;
@ -7,6 +8,7 @@ use grenad::{MergeFunction, Merger};
use heed::RoTxn;
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
use serde_json::Value;
use thread_local::ThreadLocal;
use super::super::cache::CboCachedSorter;
use super::facet_document::extract_document_facets;
@ -216,11 +218,13 @@ impl DocidsExtractor for FacetedDocidsExtractor {
let span =
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
let _entered = span.enter();
let local = ThreadLocal::new();
document_changes.into_par_iter().try_arc_for_each_try_init(
|| {
local.get_or_try(|| {
let rtxn = index.read_txn().map_err(Error::from)?;
let cache = caches.push(CboCachedSorter::new(
// TODO use a better value
/// TODO use a better value
100.try_into().unwrap(),
create_sorter(
grenad::SortAlgorithm::Stable,
@ -231,9 +235,11 @@ impl DocidsExtractor for FacetedDocidsExtractor {
max_memory,
),
));
Ok((rtxn, fields_ids_map.clone(), Vec::new(), cache))
Ok((rtxn, RefCell::new((fields_ids_map.clone(), Vec::new(), cache))))
})
},
|(rtxn, fields_ids_map, buffer, cached_sorter), document_change| {
|(rtxn, rc), document_change| {
let (fields_ids_map, buffer, cached_sorter) = &mut *rc.borrow_mut();
Self::extract_document_change(
rtxn,
index,

View File

@ -1,3 +1,4 @@
use std::cell::RefCell;
use std::collections::HashMap;
use std::fs::File;
use std::num::NonZero;
@ -6,6 +7,7 @@ use std::sync::Arc;
use grenad::{Merger, MergerBuilder};
use heed::RoTxn;
use rayon::iter::IntoParallelIterator;
use thread_local::ThreadLocal;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::SearchableExtractor;
@ -347,18 +349,23 @@ impl WordDocidsExtractors {
let span =
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
let _entered = span.enter();
let local = ThreadLocal::new();
document_changes.into_par_iter().try_arc_for_each_try_init(
|| {
local.get_or_try(|| {
let rtxn = index.read_txn().map_err(Error::from)?;
let fields_ids_map = fields_ids_map.clone();
let cache = caches.push(WordDocidsCachedSorters::new(
indexer,
max_memory,
// TODO use a better value
200_000.try_into().unwrap(),
));
Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache))
Ok((rtxn, &document_tokenizer, RefCell::new((fields_ids_map, cache))))
})
},
|(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| {
|(rtxn, document_tokenizer, rc), document_change| {
let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut();
Self::extract_document_change(
rtxn,
index,
@ -377,7 +384,9 @@ impl WordDocidsExtractors {
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
let _entered = span.enter();
let mut builder = WordDocidsMergerBuilders::new();
let mut count = 0;
for cache in caches.into_iter() {
count += 1;
builder.add_sorters(cache)?;
}

View File

@ -2,6 +2,7 @@ mod extract_word_docids;
mod extract_word_pair_proximity_docids;
mod tokenize_document;
use std::cell::RefCell;
use std::fs::File;
use std::sync::Arc;
@ -10,6 +11,7 @@ pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
use grenad::Merger;
use heed::RoTxn;
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
use thread_local::ThreadLocal;
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::cache::CboCachedSorter;
@ -64,11 +66,13 @@ pub trait SearchableExtractor {
let span =
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
let _entered = span.enter();
let local = ThreadLocal::new();
document_changes.into_par_iter().try_arc_for_each_try_init(
|| {
local.get_or_try(|| {
let rtxn = index.read_txn().map_err(Error::from)?;
let cache = caches.push(CboCachedSorter::new(
// TODO use a better value
/// TODO use a better value
1_000_000.try_into().unwrap(),
create_sorter(
grenad::SortAlgorithm::Stable,
@ -79,9 +83,15 @@ pub trait SearchableExtractor {
max_memory,
),
));
Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache))
Ok((
rtxn,
&document_tokenizer,
RefCell::new((fields_ids_map.clone(), cache)),
))
})
},
|(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| {
|(rtxn, document_tokenizer, rc), document_change| {
let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut();
Self::extract_document_change(
rtxn,
index,