From 759b9b15465d0ce80393de803ecab872147ce854 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 25 Sep 2024 14:49:03 +0200 Subject: [PATCH] Introduce a new custom Lru --- Cargo.lock | 11 +- milli/Cargo.toml | 1 + milli/src/update/new/lru.rs | 230 ++++++++++++++++++++++++++++++++++++ milli/src/update/new/mod.rs | 1 + 4 files changed, 238 insertions(+), 5 deletions(-) create mode 100644 milli/src/update/new/lru.rs diff --git a/Cargo.lock b/Cargo.lock index a4a677e73..7b3de4a6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2307,9 +2307,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.3" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ "ahash 0.8.11", "allocator-api2", @@ -2591,7 +2591,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown 0.14.5", "serde", ] @@ -3318,7 +3318,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" dependencies = [ - "hashbrown 0.14.3", + "hashbrown 0.14.5", ] [[package]] @@ -3575,6 +3575,7 @@ dependencies = [ "fxhash", "geoutils", "grenad", + "hashbrown 0.14.5", "heed", "hf-hub", "indexmap", @@ -6049,7 +6050,7 @@ version = "0.16.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" dependencies = [ - "hashbrown 0.14.3", + "hashbrown 0.14.5", "once_cell", ] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index aed966758..19986de01 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -89,6 +89,7 @@ tracing = "0.1.40" ureq = { 
version = "2.10.0", features = ["json"] } url = "2.5.2" rayon-par-bridge = "0.1.0" +hashbrown = "0.14.5" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/milli/src/update/new/lru.rs b/milli/src/update/new/lru.rs new file mode 100644 index 000000000..fef108753 --- /dev/null +++ b/milli/src/update/new/lru.rs @@ -0,0 +1,230 @@ +use std::borrow::Borrow; +use std::hash::{BuildHasher, Hash}; +use std::iter::repeat_with; +use std::mem; +use std::num::NonZeroUsize; + +use hashbrown::hash_map::{DefaultHashBuilder, Entry}; +use hashbrown::HashMap; + +pub struct Lru { + lookup: HashMap, + storage: FixedSizeList>, +} + +impl Lru { + /// Creates a new LRU cache that holds at most `capacity` elements. + pub fn new(capacity: NonZeroUsize) -> Self { + Self { lookup: HashMap::new(), storage: FixedSizeList::new(capacity.get()) } + } +} + +impl Lru { + /// Creates a new LRU cache that holds at most `capacity` elements + /// and uses the provided hash builder to hash keys. + pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru { + Self { + lookup: HashMap::with_hasher(hash_builder), + storage: FixedSizeList::new(capacity.get()), + } + } +} + +impl Lru { + /// Returns a mutable reference to the value of the key in the cache or `None` if it is not present in the cache. + /// + /// Moves the key to the head of the LRU list if it exists. 
+ pub fn get_mut(&mut self, key: &Q) -> Option<&mut V> + where + K: Borrow, + Q: Hash + Eq + ?Sized, + { + let idx = *self.lookup.get(key)?; + self.storage.move_front(idx).map(|node| &mut node.value) + } +} + +impl Lru { + pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> { + match self.lookup.entry(key) { + Entry::Occupied(occ) => { + // It's fine to unwrap here because: + // * the entry already exists + let node = self.storage.move_front(*occ.get()).unwrap(); + let old_value = mem::replace(&mut node.value, value); + let old_key = occ.replace_key(); + Some((old_key, old_value)) + } + Entry::Vacant(vac) => { + let key = vac.key().clone(); + if self.storage.is_full() { + let idx = self.storage.back_idx(); + // It's fine to unwrap here because: + // * the cache capacity is non zero + // * the cache is full + let node = self.storage.move_front(idx).unwrap(); + let LruNode { key, value } = mem::replace(node, LruNode { key, value }); + vac.insert(idx); + self.lookup.remove(&key); + Some((key, value)) + } else { + // It's fine to unwrap here because: + // * the cache capacity is non zero + // * the cache is not full + let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap(); + vac.insert(idx); + None + } + } + } + } +} + +impl IntoIterator for Lru { + type Item = (K, V); + type IntoIter = IntoIter; + + fn into_iter(self) -> Self::IntoIter { + IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes } + } +} + +pub struct IntoIter { + lookup_iter: hashbrown::hash_map::IntoIter, + nodes: Box<[Option>>]>, +} + +impl Iterator for IntoIter { + type Item = (K, V); + + fn next(&mut self) -> Option { + let (_key, idx) = self.lookup_iter.next()?; + let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data; + Some((key, value)) + } +} + +struct LruNode { + key: K, + value: V, +} + +struct FixedSizeListNode { + prev: usize, + next: usize, + data: T, +} + +struct FixedSizeList { + nodes: Box<[Option>]>, + // An un-ordered set of 
indices that are not in use in `nodes`. + // All `None` entries in `nodes` _must_ be listed in `free`. + // A `Vec` was chosen in order to have O(1) complexity + // for pop and avoid having to go through `nodes` in order + // to find a free place. + // TODO remove the free list as it is always growing: + // we cannot remove entries from the map. + // Also, we probably do not need one of the front and back cursors. + free: Vec, + front: usize, + back: usize, +} + +impl FixedSizeList { + fn new(capacity: usize) -> Self { + Self { + nodes: repeat_with(|| None).take(capacity).collect::>().into_boxed_slice(), + free: (0..capacity).collect(), + front: usize::MAX, + back: usize::MAX, + } + } + + #[inline] + fn capacity(&self) -> usize { + self.nodes.len() + } + + #[inline] + fn len(&self) -> usize { + self.nodes.len() - self.free.len() + } + + #[inline] + fn is_empty(&self) -> bool { + self.len() == 0 + } + + #[inline] + fn is_full(&self) -> bool { + self.len() == self.capacity() + } + + #[inline] + fn back_idx(&self) -> usize { + self.back + } + + #[inline] + fn next(&mut self) -> Option { + self.free.pop() + } + + #[inline] + fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode> { + self.nodes.get_mut(idx).and_then(|node| node.as_mut()) + } + + #[inline] + fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode> { + self.nodes.get(idx).and_then(|node| node.as_ref()) + } + + #[inline] + fn move_front(&mut self, idx: usize) -> Option<&mut T> { + let node = self.nodes.get_mut(idx)?.take()?; + if let Some(prev) = self.node_mut(node.prev) { + prev.next = node.next; + } else { + self.front = node.next; + } + if let Some(next) = self.node_mut(node.next) { + next.prev = node.prev; + } else { + self.back = node.prev; + } + + if let Some(front) = self.node_mut(self.front) { + front.prev = idx; + } + if self.node_ref(self.back).is_none() { + self.back = idx; + } + + let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { + prev: usize::MAX, + 
next: self.front, + data: node.data, + }); + self.front = idx; + Some(&mut node.data) + } + + #[inline] + fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> { + let idx = self.next()?; + if let Some(front) = self.node_mut(self.front) { + front.prev = idx; + } + if self.node_ref(self.back).is_none() { + self.back = idx; + } + let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode { + prev: usize::MAX, + next: self.front, + data, + }); + self.front = idx; + Some((idx, &mut node.data)) + } +} diff --git a/milli/src/update/new/mod.rs b/milli/src/update/new/mod.rs index dedd89497..b4878a8fe 100644 --- a/milli/src/update/new/mod.rs +++ b/milli/src/update/new/mod.rs @@ -10,6 +10,7 @@ mod document_change; mod extract; pub mod indexer; mod items_pool; +mod lru; mod merger; mod top_level_map; mod word_fst_builder;