Introduce a new custom Lru

This commit is contained in:
Clément Renault 2024-09-25 14:49:03 +02:00
parent 3f7a500f3b
commit 759b9b1546
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 238 additions and 5 deletions

11
Cargo.lock generated
View File

@ -2307,9 +2307,9 @@ dependencies = [
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.14.3" version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [ dependencies = [
"ahash 0.8.11", "ahash 0.8.11",
"allocator-api2", "allocator-api2",
@ -2591,7 +2591,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
dependencies = [ dependencies = [
"equivalent", "equivalent",
"hashbrown 0.14.3", "hashbrown 0.14.5",
"serde", "serde",
] ]
@ -3318,7 +3318,7 @@ version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904"
dependencies = [ dependencies = [
"hashbrown 0.14.3", "hashbrown 0.14.5",
] ]
[[package]] [[package]]
@ -3575,6 +3575,7 @@ dependencies = [
"fxhash", "fxhash",
"geoutils", "geoutils",
"grenad", "grenad",
"hashbrown 0.14.5",
"heed", "heed",
"hf-hub", "hf-hub",
"indexmap", "indexmap",
@ -6049,7 +6050,7 @@ version = "0.16.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0"
dependencies = [ dependencies = [
"hashbrown 0.14.3", "hashbrown 0.14.5",
"once_cell", "once_cell",
] ]

View File

@ -89,6 +89,7 @@ tracing = "0.1.40"
ureq = { version = "2.10.0", features = ["json"] } ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2" url = "2.5.2"
rayon-par-bridge = "0.1.0" rayon-par-bridge = "0.1.0"
hashbrown = "0.14.5"
[dev-dependencies] [dev-dependencies]
mimalloc = { version = "0.1.43", default-features = false } mimalloc = { version = "0.1.43", default-features = false }

230
milli/src/update/new/lru.rs Normal file
View File

@ -0,0 +1,230 @@
use std::borrow::Borrow;
use std::hash::{BuildHasher, Hash};
use std::iter::repeat_with;
use std::mem;
use std::num::NonZeroUsize;
use hashbrown::hash_map::{DefaultHashBuilder, Entry};
use hashbrown::HashMap;
/// A least-recently-used cache with a fixed number of slots.
///
/// Keys are looked up through a hash map that stores, for every key, the
/// index of the slot holding its entry in the recency-ordered storage list.
pub struct Lru<K, V, S = DefaultHashBuilder> {
    /// Maps every cached key to the index of its node in `storage`.
    lookup: HashMap<K, usize, S>,
    /// Fixed-capacity list of entries; its front is the most recently used one.
    storage: FixedSizeList<LruNode<K, V>>,
}
impl<K: Eq + Hash, V> Lru<K, V> {
    /// Builds an empty LRU cache that can hold up to `capacity` entries,
    /// hashing keys with the default hash builder.
    pub fn new(capacity: NonZeroUsize) -> Self {
        let lookup = HashMap::new();
        let storage = FixedSizeList::new(capacity.get());
        Self { lookup, storage }
    }
}
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
    /// Builds an empty LRU cache that can hold up to `capacity` entries
    /// and hashes its keys with the given `hash_builder`.
    pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru<K, V, S> {
        let lookup = HashMap::with_hasher(hash_builder);
        let storage = FixedSizeList::new(capacity.get());
        Self { lookup, storage }
    }
}
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
    /// Looks up `key` and hands back a mutable reference to its value,
    /// or `None` when the key is not cached.
    ///
    /// A successful lookup also moves the entry to the head of the LRU list,
    /// making it the most recently used one.
    pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut V>
    where
        K: Borrow<Q>,
        Q: Hash + Eq + ?Sized,
    {
        // Copy the slot index out so the borrow of `lookup` ends here.
        let idx = self.lookup.get(key).copied()?;
        let node = self.storage.move_front(idx)?;
        Some(&mut node.value)
    }
}
impl<K: Clone + Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
    /// Inserts `key`/`value` as the most recently used entry.
    ///
    /// Returns:
    /// * the previous `(key, value)` pair when `key` was already cached
    ///   (its value is replaced and the entry is moved to the front),
    /// * the evicted least recently used `(key, value)` pair when `key` is
    ///   new and the cache is full,
    /// * `None` when `key` is new and a free slot was available.
    pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> {
        let vacant = match self.lookup.entry(key) {
            Entry::Occupied(occupied) => {
                // Unwrapping is fine: the lookup only contains indices of
                // slots that currently hold a node.
                let slot = self.storage.move_front(*occupied.get()).unwrap();
                let previous_value = mem::replace(&mut slot.value, value);
                return Some((occupied.replace_key(), previous_value));
            }
            Entry::Vacant(vacant) => vacant,
        };

        let fresh = LruNode { key: vacant.key().clone(), value };

        if !self.storage.is_full() {
            // Unwrapping is fine: the capacity is non-zero and the list is
            // not full, so a free slot exists.
            let (idx, _) = self.storage.push_front(fresh).unwrap();
            vacant.insert(idx);
            return None;
        }

        // The list is full: recycle the slot of the least recently used
        // entry. Unwrapping is fine because the capacity is non-zero and the
        // list is full, so the back index designates a live node.
        let idx = self.storage.back_idx();
        let victim = self.storage.move_front(idx).unwrap();
        let LruNode { key: evicted_key, value: evicted_value } = mem::replace(victim, fresh);
        vacant.insert(idx);
        self.lookup.remove(&evicted_key);
        Some((evicted_key, evicted_value))
    }
}
impl<K, V, S> IntoIterator for Lru<K, V, S> {
type Item = (K, V);
type IntoIter = IntoIter<K, V>;
fn into_iter(self) -> Self::IntoIter {
IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes }
}
}
/// Owning iterator over the `(key, value)` pairs of an [`Lru`].
pub struct IntoIter<K, V> {
    /// Drives the iteration: yields every cached key with its slot index.
    lookup_iter: hashbrown::hash_map::IntoIter<K, usize>,
    /// The slots of the cache; a node is taken out once it has been yielded.
    nodes: Box<[Option<FixedSizeListNode<LruNode<K, V>>>]>,
}
impl<K, V> Iterator for IntoIter<K, V> {
    type Item = (K, V);

    /// Advances the lookup iterator and returns the entry stored in the slot
    /// it points at. Yields `None` when the lookup is exhausted or when the
    /// designated slot is out of range or empty.
    fn next(&mut self) -> Option<Self::Item> {
        let (_lookup_key, idx) = self.lookup_iter.next()?;
        let slot = self.nodes.get_mut(idx)?;
        let node = slot.take()?;
        let LruNode { key, value } = node.data;
        Some((key, value))
    }
}
/// A cache entry as stored in the recency list: the key alongside its value.
struct LruNode<K, V> {
    /// The key of the entry.
    key: K,
    /// The value associated with `key`.
    value: V,
}
/// A slot of a [`FixedSizeList`]: the payload plus the indices of its neighbours.
struct FixedSizeListNode<T> {
    /// Index of the neighbour closer to the front (`usize::MAX` for the front node).
    prev: usize,
    /// Index of the neighbour closer to the back (`usize::MAX` for the back node).
    next: usize,
    /// The payload stored in this slot.
    data: T,
}

/// A doubly linked list stored in a fixed number of pre-allocated slots.
///
/// Nodes reference each other by slot index; an index that does not designate
/// an occupied slot (such as `usize::MAX`) plays the role of "no node".
struct FixedSizeList<T> {
    /// The slots of the list; `None` marks a slot that is not in use.
    nodes: Box<[Option<FixedSizeListNode<T>>]>,
    // An unordered set of the indices that are not in use in `nodes`.
    // All `None` entries in `nodes` _must_ be listed in `free`.
    // A `Vec<usize>` was chosen in order to have O(1) complexity
    // for pop and to avoid scanning `nodes` to find a free slot.
    // TODO remove the free list: nodes are never removed from this list,
    //      so `free` is only ever popped from.
    //      Also, we probably do not need one of the front and back cursors.
    free: Vec<usize>,
    /// Index of the front node, or `usize::MAX` while the list is empty.
    front: usize,
    /// Index of the back node, or `usize::MAX` while the list is empty.
    back: usize,
}

impl<T> FixedSizeList<T> {
    /// Creates an empty list with `capacity` free slots.
    fn new(capacity: usize) -> Self {
        let empty_slots: Vec<Option<FixedSizeListNode<T>>> =
            repeat_with(|| None).take(capacity).collect();
        Self {
            nodes: empty_slots.into_boxed_slice(),
            free: (0..capacity).collect(),
            front: usize::MAX,
            back: usize::MAX,
        }
    }

    /// Total number of slots.
    #[inline]
    fn capacity(&self) -> usize {
        self.nodes.len()
    }

    /// Number of slots currently holding a node.
    #[inline]
    fn len(&self) -> usize {
        self.capacity() - self.free.len()
    }

    /// Whether no slot is in use.
    #[inline]
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Whether every slot is in use, i.e. no free index is left.
    #[inline]
    fn is_full(&self) -> bool {
        self.free.is_empty()
    }

    /// Index of the node at the back of the list.
    #[inline]
    fn back_idx(&self) -> usize {
        self.back
    }

    /// Reserves a free slot index, or returns `None` when the list is full.
    #[inline]
    fn next(&mut self) -> Option<usize> {
        self.free.pop()
    }

    /// Mutable access to the node in slot `idx`, if that slot exists and is occupied.
    #[inline]
    fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode<T>> {
        self.nodes.get_mut(idx)?.as_mut()
    }

    /// Shared access to the node in slot `idx`, if that slot exists and is occupied.
    #[inline]
    fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode<T>> {
        self.nodes.get(idx)?.as_ref()
    }

    /// Moves the node in slot `idx` to the front of the list and returns its
    /// payload, or `None` when `idx` is out of range or the slot is empty.
    #[inline]
    fn move_front(&mut self, idx: usize) -> Option<&mut T> {
        let FixedSizeListNode { prev, next, data } = self.nodes.get_mut(idx)?.take()?;

        // Unlink the node: make its neighbours point at each other, or move
        // the front/back cursor when the node sat at that end of the list.
        match self.node_mut(prev) {
            Some(before) => before.next = next,
            None => self.front = next,
        }
        match self.node_mut(next) {
            Some(after) => after.prev = prev,
            None => self.back = prev,
        }

        Some(self.attach_front(idx, data))
    }

    /// Stores `data` in a free slot placed at the front of the list and
    /// returns the slot index with the payload, or `None` when the list is full.
    #[inline]
    fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> {
        let idx = self.next()?;
        Some((idx, self.attach_front(idx, data)))
    }

    /// Writes `data` into slot `idx` and links that slot in as the new front.
    ///
    /// The slot must not currently be linked into the list.
    /// Panics when `idx` is out of range.
    #[inline]
    fn attach_front(&mut self, idx: usize, data: T) -> &mut T {
        let former_front = self.front;
        if let Some(head) = self.node_mut(former_front) {
            head.prev = idx;
        }
        // No valid back node means the list was empty: the new node is also the back.
        if self.node_ref(self.back).is_none() {
            self.back = idx;
        }
        let slot = self.nodes.get_mut(idx).unwrap();
        let node = slot.insert(FixedSizeListNode { prev: usize::MAX, next: former_front, data });
        self.front = idx;
        &mut node.data
    }
}

View File

@ -10,6 +10,7 @@ mod document_change;
mod extract; mod extract;
pub mod indexer; pub mod indexer;
mod items_pool; mod items_pool;
mod lru;
mod merger; mod merger;
mod top_level_map; mod top_level_map;
mod word_fst_builder; mod word_fst_builder;