Reduce the number of caches created by using thread_local

ManyTheFish 2024-10-07 15:58:16 +02:00
parent 58d96fbea3
commit c11b7e5c0f
5 changed files with 68 additions and 41 deletions
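
The change is the same in all three extractors below. rayon's `*_init` adapters call their init closure once per task rather than once per thread, and a parallel document stream is split into far more tasks than worker threads, so building a sorter cache inside init created many more caches than necessary. Routing the allocation through `thread_local::ThreadLocal::get_or_try` caps it at one cache per thread; since the thread-local handle is a shared `&T`, the parts that must stay mutable move into a `RefCell` and each task takes a short-lived `borrow_mut()`. Below is a minimal, self-contained sketch of the pattern, with illustrative names and a plain `Vec` in place of the milli cache types; the commit uses the fallible `get_or_try` because opening the LMDB read transaction can fail, while the sketch uses the infallible `get_or`:

    use std::cell::RefCell;

    use rayon::prelude::*;
    use thread_local::ThreadLocal;

    fn main() {
        // One slot per OS thread that touches it; RefCell restores the
        // mutability that ThreadLocal's shared `&T` handle takes away.
        let local: ThreadLocal<RefCell<Vec<u32>>> = ThreadLocal::new();

        (0u32..10_000).into_par_iter().for_each_init(
            // rayon runs this init closure once per task, and it splits the
            // range into many more tasks than threads. `get_or` allocates
            // only on each thread's first call and returns the existing
            // buffer on every later call from the same thread.
            || local.get_or(|| RefCell::new(Vec::new())),
            |buffer, n| buffer.borrow_mut().push(n),
        );

        // The number of buffers equals the number of threads that did
        // work, not the number of init calls.
        let buffers: Vec<_> = local.into_iter().collect();
        println!("buffers: {}", buffers.len());
        println!(
            "items: {}",
            buffers.iter().map(|b| b.borrow().len()).sum::<usize>()
        );
    }

The payoff shows up after extraction: the merge phase has roughly one set of sorters per thread to combine instead of one per task.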

Cargo.lock (generated)

@@ -3598,6 +3598,7 @@ dependencies = [
  "smartstring",
  "tempfile",
  "thiserror",
+ "thread_local",
  "tiktoken-rs",
  "time",
  "tokenizers",
@@ -5332,9 +5333,9 @@ dependencies = [
 [[package]]
 name = "thread_local"
-version = "1.1.7"
+version = "1.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152"
+checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
 dependencies = [
  "cfg-if",
  "once_cell",

Cargo.toml

@@ -89,6 +89,7 @@ ureq = { version = "2.10.0", features = ["json"] }
 url = "2.5.2"
 rayon-par-bridge = "0.1.0"
 hashbrown = "0.14.5"
+thread_local = "1.1.8"
 
 [dev-dependencies]
 mimalloc = { version = "0.1.43", default-features = false }


@@ -1,3 +1,4 @@
+use std::cell::RefCell;
 use std::collections::HashSet;
 use std::fmt::Debug;
 use std::fs::File;
@@ -7,6 +8,7 @@ use grenad::{MergeFunction, Merger};
 use heed::RoTxn;
 use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
 use serde_json::Value;
+use thread_local::ThreadLocal;
 
 use super::super::cache::CboCachedSorter;
 use super::facet_document::extract_document_facets;
@@ -216,11 +218,13 @@ impl DocidsExtractor for FacetedDocidsExtractor {
         let span =
             tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
         let _entered = span.enter();
+        let local = ThreadLocal::new();
         document_changes.into_par_iter().try_arc_for_each_try_init(
             || {
+                local.get_or_try(|| {
                 let rtxn = index.read_txn().map_err(Error::from)?;
                 let cache = caches.push(CboCachedSorter::new(
-                    // TODO use a better value
+                    /// TODO use a better value
                     100.try_into().unwrap(),
                     create_sorter(
                         grenad::SortAlgorithm::Stable,
@@ -231,9 +235,11 @@ impl DocidsExtractor for FacetedDocidsExtractor {
                         max_memory,
                     ),
                 ));
-                Ok((rtxn, fields_ids_map.clone(), Vec::new(), cache))
+                    Ok((rtxn, RefCell::new((fields_ids_map.clone(), Vec::new(), cache))))
+                })
             },
-            |(rtxn, fields_ids_map, buffer, cached_sorter), document_change| {
+            |(rtxn, rc), document_change| {
+                let (fields_ids_map, buffer, cached_sorter) = &mut *rc.borrow_mut();
                 Self::extract_document_change(
                     rtxn,
                     index,


@@ -1,3 +1,4 @@
+use std::cell::RefCell;
 use std::collections::HashMap;
 use std::fs::File;
 use std::num::NonZero;
@@ -6,6 +7,7 @@ use std::sync::Arc;
 use grenad::{Merger, MergerBuilder};
 use heed::RoTxn;
 use rayon::iter::IntoParallelIterator;
+use thread_local::ThreadLocal;
 
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
 use super::SearchableExtractor;
@@ -347,18 +349,23 @@ impl WordDocidsExtractors {
         let span =
             tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
         let _entered = span.enter();
+        let local = ThreadLocal::new();
         document_changes.into_par_iter().try_arc_for_each_try_init(
             || {
+                local.get_or_try(|| {
                 let rtxn = index.read_txn().map_err(Error::from)?;
+                    let fields_ids_map = fields_ids_map.clone();
                 let cache = caches.push(WordDocidsCachedSorters::new(
                     indexer,
                     max_memory,
                     // TODO use a better value
                     200_000.try_into().unwrap(),
                 ));
-                Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache))
+                    Ok((rtxn, &document_tokenizer, RefCell::new((fields_ids_map, cache))))
+                })
             },
-            |(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| {
+            |(rtxn, document_tokenizer, rc), document_change| {
+                let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut();
                 Self::extract_document_change(
                     rtxn,
                     index,
@@ -377,7 +384,9 @@ impl WordDocidsExtractors {
             tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
         let _entered = span.enter();
         let mut builder = WordDocidsMergerBuilders::new();
+        let mut count = 0;
         for cache in caches.into_iter() {
+            count += 1;
             builder.add_sorters(cache)?;
         }


@@ -2,6 +2,7 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod tokenize_document;
 
+use std::cell::RefCell;
 use std::fs::File;
 use std::sync::Arc;
@@ -10,6 +11,7 @@ pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
 use grenad::Merger;
 use heed::RoTxn;
 use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
+use thread_local::ThreadLocal;
 use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 
 use super::cache::CboCachedSorter;
@@ -64,11 +66,13 @@ pub trait SearchableExtractor {
         let span =
             tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
         let _entered = span.enter();
+        let local = ThreadLocal::new();
         document_changes.into_par_iter().try_arc_for_each_try_init(
             || {
+                local.get_or_try(|| {
                 let rtxn = index.read_txn().map_err(Error::from)?;
                 let cache = caches.push(CboCachedSorter::new(
-                    // TODO use a better value
+                    /// TODO use a better value
                     1_000_000.try_into().unwrap(),
                     create_sorter(
                         grenad::SortAlgorithm::Stable,
@@ -79,9 +83,15 @@ pub trait SearchableExtractor {
                         max_memory,
                     ),
                 ));
-                Ok((rtxn, &document_tokenizer, fields_ids_map.clone(), cache))
+                    Ok((
+                        rtxn,
+                        &document_tokenizer,
+                        RefCell::new((fields_ids_map.clone(), cache)),
+                    ))
+                })
             },
-            |(rtxn, document_tokenizer, fields_ids_map, cached_sorter), document_change| {
+            |(rtxn, document_tokenizer, rc), document_change| {
+                let (fields_ids_map, cached_sorter) = &mut *rc.borrow_mut();
                 Self::extract_document_change(
                     rtxn,
                     index,