Replace the arc cache by a simple linked hash map

Clément Renault 2020-09-23 14:50:52 +02:00
parent 4d22d80281
commit ed05999f63
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 28 additions and 30 deletions

Cargo.lock (generated)

@@ -12,14 +12,6 @@ version = "1.0.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f"
 
-[[package]]
-name = "arc-cache"
-version = "0.2.4"
-source = "git+https://github.com/Kerollmops/rust-arc-cache.git?rev=56530f2#56530f2d219823f8f88dc03851f8fe057bd72564"
-dependencies = [
- "xlru-cache",
-]
-
 [[package]]
 name = "arc-swap"
 version = "0.4.6"
@@ -957,7 +949,6 @@ name = "milli"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "arc-cache",
  "askama",
  "askama_warp",
  "bstr",
@@ -971,6 +962,7 @@ dependencies = [
  "itertools",
  "jemallocator",
  "levenshtein_automata",
+ "linked-hash-map",
  "log 0.4.11",
  "memmap",
  "near-proximity",
@@ -2356,14 +2348,6 @@ dependencies = [
  "winapi-build",
 ]
 
-[[package]]
-name = "xlru-cache"
-version = "0.1.2"
-source = "git+https://github.com/Kerollmops/rust-xlru-cache.git?rev=3c90f49#3c90f49e11758ee0cc4ff145b2606ba143188b77"
-dependencies = [
- "linked-hash-map",
-]
-
 [[package]]
 name = "zerocopy"
 version = "0.3.0"

Cargo.toml

@@ -7,7 +7,6 @@ default-run = "indexer"
 
 [dependencies]
 anyhow = "1.0.28"
-arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" }
 bstr = "0.2.13"
 byteorder = "1.3.4"
 csv = "1.1.3"
@@ -17,6 +16,7 @@ fxhash = "0.2.1"
 heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
+linked-hash-map = "0.5.3"
 memmap = "0.7.0"
 near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
 once_cell = "1.4.0"


@@ -8,12 +8,12 @@ use std::{iter, thread};
 use std::time::Instant;
 
 use anyhow::Context;
-use arc_cache::ArcCache;
 use bstr::ByteSlice as _;
 use csv::StringRecord;
 use flate2::read::GzDecoder;
 use fst::IntoStreamer;
 use heed::{EnvOpenOptions, BytesEncode, types::*};
+use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
 use memmap::Mmap;
 use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
@@ -89,9 +89,10 @@ struct IndexerOpt {
     #[structopt(long, default_value = "1610612736")] // 1.5 GB
     max_memory: usize,
 
-    /// Size of the ARC cache when indexing.
-    #[structopt(long, default_value = "43690")]
-    arc_cache_size: usize,
+    /// Size of the linked hash map cache when indexing.
+    /// The bigger it is, the faster the indexing is, but the more memory it takes.
+    #[structopt(long, default_value = "4096")]
+    linked_hash_map_size: usize,
 
     /// The name of the compression algorithm to use when compressing intermediate
     /// chunks during indexing documents.
@@ -159,7 +160,7 @@ fn compute_words_pair_proximities(
 type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
 
 struct Store {
-    word_docids: ArcCache<SmallVec32<u8>, RoaringBitmap>,
+    word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
     documents_ids: RoaringBitmap,
     sorter: Sorter<MergeFn>,
     documents_sorter: Sorter<MergeFn>,
@@ -169,7 +170,7 @@ struct Store {
 
 impl Store {
     pub fn new(
-        arc_cache_size: usize,
+        linked_hash_map_size: usize,
         max_nb_chunks: Option<usize>,
         max_memory: Option<usize>,
         chunk_compression_type: CompressionType,
@@ -195,7 +196,8 @@ impl Store {
         }
 
         Store {
-            word_docids: ArcCache::new(arc_cache_size),
+            // We overflow by one before popping the LRU element.
+            word_docids: LinkedHashMap::with_capacity(linked_hash_map_size + 1),
             documents_ids: RoaringBitmap::new(),
             sorter: builder.build(),
             documents_sorter: documents_builder.build(),
@@ -207,9 +209,21 @@
     // Save the documents ids under the position and word we have seen it.
     fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> {
         let word_vec = SmallVec32::from(word.as_bytes());
-        let ids = RoaringBitmap::from_iter(Some(id));
-        let (_, lrus) = self.word_docids.insert(word_vec, ids, |old, new| old.union_with(&new));
-        Self::write_word_docids(&mut self.sorter, lrus)?;
+        // If get_refresh finds the element, it is assured to be at the end of the linked hash map.
+        match self.word_docids.get_refresh(&word_vec) {
+            Some(old) => { old.insert(id); },
+            None => {
+                // A newly inserted element is appended at the end of the linked hash map.
+                self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id)));
+                // If the word docids just reached its capacity we must make sure to remove
+                // one element, this way the next insertion does not grow the capacity.
+                if self.word_docids.len() == self.word_docids.capacity() {
+                    // Removing the front element is equivalent to removing the LRU element.
+                    let lru = self.word_docids.pop_front();
+                    Self::write_word_docids(&mut self.sorter, lru)?;
+                }
+            }
+        }
         Ok(())
     }
@@ -600,7 +614,7 @@ fn main() -> anyhow::Result<()> {
     let index = Index::new(&env)?;
 
     let num_threads = rayon::current_num_threads();
-    let arc_cache_size = opt.indexer.arc_cache_size;
+    let linked_hash_map_size = opt.indexer.linked_hash_map_size;
     let max_nb_chunks = opt.indexer.max_nb_chunks;
    let max_memory = opt.indexer.max_memory;
     let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type);
@@ -611,7 +625,7 @@
         .enumerate()
         .map(|(i, rdr)| {
             Store::new(
-                arc_cache_size,
+                linked_hash_map_size,
                 max_nb_chunks,
                 Some(max_memory),
                 chunk_compression_type,
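
For reference, the pattern introduced by this commit (refresh an entry on a hit, insert on a miss, then pop the front entry once the size bound is exceeded) can be exercised on its own. The following is a minimal, self-contained sketch assuming the linked-hash-map 0.5 API (with_capacity, get_refresh, pop_front); the BoundedLru wrapper, its insert_with helper and the toy counter values are illustrative only and are not part of this commit.

use linked_hash_map::LinkedHashMap;
use std::hash::Hash;

/// A tiny bounded LRU map: get_refresh moves a hit to the back of the
/// linked hash map, so the front entry is always the least recently used.
struct BoundedLru<K: Hash + Eq, V> {
    map: LinkedHashMap<K, V>,
    max_size: usize,
}

impl<K: Hash + Eq, V> BoundedLru<K, V> {
    fn new(max_size: usize) -> Self {
        // One spare slot so that we can insert first and evict afterwards,
        // mirroring the `linked_hash_map_size + 1` capacity used above.
        BoundedLru { map: LinkedHashMap::with_capacity(max_size + 1), max_size }
    }

    /// Inserts or merges `value` and returns the evicted LRU entry, if any.
    fn insert_with(&mut self, key: K, value: V, merge: impl FnOnce(&mut V, V)) -> Option<(K, V)> {
        match self.map.get_refresh(&key) {
            // A hit is moved to the back of the map and merged in place.
            Some(old) => { merge(old, value); None }
            None => {
                // A miss is appended at the back of the map.
                self.map.insert(key, value);
                // Pop the front (least recently used) entry once the bound is exceeded.
                if self.map.len() > self.max_size { self.map.pop_front() } else { None }
            }
        }
    }
}

fn main() {
    let mut cache = BoundedLru::new(2);
    assert!(cache.insert_with("hello", 1u32, |old, new| *old += new).is_none());
    assert!(cache.insert_with("world", 1, |old, new| *old += new).is_none());
    // Refreshing "hello" makes "world" the least recently used entry...
    cache.insert_with("hello", 1, |old, new| *old += new);
    // ...so it is the one evicted (and, in the indexer, flushed to the sorter)
    // when a third key overflows the cache.
    assert_eq!(cache.insert_with("rust", 1, |old, new| *old += new), Some(("world", 1)));
}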