Implement a memory dumper

It moves the in-memory HashMaps used when indexing into a disk-based MTBL file
2024-11-22 18:17:39 +08:00 · 2020-07-07 16:48:49 +02:00 · 2020-07-07 16:48:49 +02:00 · 11c7fef80a
commit 11c7fef80a
parent b12bfcb03b
3 changed files with 81 additions and 12 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1483,7 +1483,7 @@ dependencies = [
 [[package]]
 name = "roaring"
 version = "0.6.0"
-source = "git+https://github.com/Kerollmops/roaring-rs.git?branch=deserialize-from-slice#24420bb9f980749476cec860ea8dd3c1683c0cd1"
+source = "git+https://github.com/Kerollmops/roaring-rs.git?branch=mem-usage#a71692552902019751ef5b0e57336f030045a76a"
 dependencies = [
 "byteorder",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -20,7 +20,7 @@ memmap = "0.7.0"
 once_cell = "1.4.0"
 oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "9451be8" }
 rayon = "1.3.1"
-roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
+roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }
 slice-group-by = "0.2.6"
 smallstr = "0.2.0"
 smallvec = "1.4.0"
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@ -2,6 +2,7 @@ use std::collections::hash_map::Entry;
 use std::collections::{HashMap, BTreeSet};
 use std::convert::{TryFrom, TryInto};
 use std::fs::File;
 use std::mem;
 use std::path::PathBuf;
 use std::time::Instant;
@ -45,6 +46,14 @@ struct Opt {
    #[structopt(short, long)]
    jobs: Option<usize>,
    /// Maximum number of bytes to allocate, will be divided by the number of
    /// cores used. It is recommended to set a maximum of half of the available memory
    /// as the current measurement method is really bad.
    ///
    /// The minimum amount of memory used will be 50MB anyway.
    #[structopt(long, default_value = "4294967296")]
    max_memory_usage: usize,
    /// CSV file to index, if unspecified the CSV is read from standard input.
    csv_file: Option<PathBuf>,
 }
@ -57,6 +66,21 @@ struct Indexed {
    documents: Vec<(DocumentId, Vec<u8>)>,
 }
 impl Indexed {
    /// Assembles an `Indexed` out of the postings and documents gathered so far.
    ///
    /// The word set (`fst`) is derived from the keys of `word_position_docids`;
    /// every other argument is moved into the resulting value untouched.
    ///
    /// # Errors
    ///
    /// Returns whatever error `fst::Set::from_iter` reports while building the set.
    fn new(
        word_positions: FastMap4<SmallVec32<u8>, RoaringBitmap>,
        word_position_docids: FastMap4<(SmallVec32<u8>, Position), RoaringBitmap>,
        headers: Vec<u8>,
        documents: Vec<(DocumentId, Vec<u8>)>,
    ) -> anyhow::Result<Indexed>
    {
        // Collect the words of the postings through a `BTreeSet` so that the
        // set builder receives each word once, in sorted order.
        let mut sorted_words = BTreeSet::new();
        for ((word, _), _) in word_position_docids.iter() {
            sorted_words.insert(word);
        }

        let fst = fst::Set::from_iter(sorted_words)?;

        Ok(Indexed { fst, headers, word_positions, word_position_docids, documents })
    }
 }
 /// An MTBL key-value store backed by an optional file.
 ///
 /// The derived `Default` yields a store holding `None`, i.e. one without any
 /// backing file. The merge code in this file memory-maps the file when it is
 /// present and simply skips stores that have none.
 #[derive(Default)]
 struct MtblKvStore(Option<File>);
@ -175,6 +199,7 @@ impl MtblKvStore {
    where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()>
    {
        eprintln!("Merging {} MTBL stores...", stores.len());
        let before = Instant::now();
        let mmaps: Vec<_> = stores.iter().flat_map(|m| {
            m.0.as_ref().map(|f| unsafe { memmap::Mmap::map(f).unwrap() })
@ -192,20 +217,49 @@ impl MtblKvStore {
            (f)(k, v)?;
        }
-        eprintln!("MTBL stores merged!");
+        eprintln!("MTBL stores merged in {:.02?}!", before.elapsed());
        Ok(())
    }
 }
 /// Estimates, in bytes, the memory held by the in-progress indexing structures.
 ///
 /// For each collection the estimate adds up:
 /// - the slots reserved by the collection itself (`capacity() * size_of::<entry>()`),
 /// - the heap data owned by its entries: the capacity of every document buffer,
 ///   the capacity of every key that spilled out of its inline storage, and
 ///   whatever `RoaringBitmap::mem_usage` reports for every bitmap.
 ///
 /// Nothing else is counted, so the result is a rough approximation rather
 /// than an exact measurement.
 fn mem_usage(
    word_positions: &FastMap4<SmallVec32<u8>, RoaringBitmap>,
    word_position_docids: &FastMap4<(SmallVec32<u8>, Position), RoaringBitmap>,
    documents: &Vec<(u32, Vec<u8>)>,
 ) -> usize
 {
    use std::mem::size_of;

    // Slots are sized with the vector's real element type, `(u32, Vec<u8>)`,
    // so the estimate does not silently depend on the width of `Position`.
    // `documents` stays a `&Vec` because the reserved capacity is needed here.
    let documents =
          documents.iter().map(|(_, d)| d.capacity()).sum::<usize>()
        + documents.capacity() * size_of::<(u32, Vec<u8>)>();

    // A key only adds heap bytes once it has spilled; inline keys are
    // already part of the per-slot size.
    let word_positions =
          word_positions.iter().map(|(k, r)| {
            (if k.spilled() { k.capacity() } else { 0 }) + r.mem_usage()
          }).sum::<usize>()
        + word_positions.capacity() * size_of::<(SmallVec32<u8>, RoaringBitmap)>();

    let word_position_docids =
          word_position_docids.iter().map(|((k, _), r)| {
            (if k.spilled() { k.capacity() } else { 0 }) + r.mem_usage()
          }).sum::<usize>()
        + word_position_docids.capacity() * size_of::<((SmallVec32<u8>, Position), RoaringBitmap)>();

    documents + word_positions + word_position_docids
 }
 fn index_csv(
    mut rdr: csv::Reader<File>,
    thread_index: usize,
    num_threads: usize,
    max_mem_usage: usize,
 ) -> anyhow::Result<Vec<MtblKvStore>>
 {
    eprintln!("{:?}: Indexing into an Indexed...", thread_index);
-    let mut document = csv::StringRecord::new();
+    let mut stores = Vec::new();
    let mut word_positions = FastMap4::default();
    let mut word_position_docids = FastMap4::default();
    let mut documents = Vec::new();
@ -217,6 +271,7 @@ fn index_csv(
    let headers = writer.into_inner()?;
    let mut document_id: usize = 0;
    let mut document = csv::StringRecord::new();
    while rdr.read_record(&mut document)? {
        document_id = document_id + 1;
@ -251,16 +306,28 @@ fn index_csv(
        writer.write_byte_record(document.as_byte_record())?;
        let document = writer.into_inner()?;
        documents.push((document_id, document));
        if documents.len() % 100_000 == 0 {
            let usage = mem_usage(&word_positions, &word_position_docids, &documents);
            if usage > max_mem_usage {
                eprintln!("Whoops too much memory used ({}B).", usage);
                let word_positions = mem::take(&mut word_positions);
                let word_position_docids = mem::take(&mut word_position_docids);
                let documents = mem::take(&mut documents);
                let indexed = Indexed::new(word_positions, word_position_docids, headers.clone(), documents)?;
                eprintln!("{:?}: Indexed created!", thread_index);
                stores.push(MtblKvStore::from_indexed(indexed)?);
            }
        }
    }
-    // We store the words from the postings.
+    let indexed = Indexed::new(word_positions, word_position_docids, headers, documents)?;
    let new_words: BTreeSet<_> = word_position_docids.iter().map(|((w, _), _)| w).collect();
    let fst = fst::Set::from_iter(new_words)?;
    let indexed = Indexed { fst, headers, word_positions, word_position_docids, documents };
    eprintln!("{:?}: Indexed created!", thread_index);
    stores.push(MtblKvStore::from_indexed(indexed)?);
-    MtblKvStore::from_indexed(indexed).map(|x| vec![x])
+    Ok(stores)
 }
 // TODO merge with the previous values
@ -362,15 +429,17 @@ fn main() -> anyhow::Result<()> {
    let index = Index::new(&env)?;
    // We duplicate the file # CPU times
    let num_threads = rayon::current_num_threads();
    let max_memory_usage = (opt.max_memory_usage / num_threads).max(50 * 1024 * 1024); // 50MB
    // We duplicate the file # jobs times.
    let file = opt.csv_file.unwrap();
    let csv_readers: Vec<_> = (0..num_threads).map(|_| csv::Reader::from_path(&file)).collect::<Result<_, _>>()?;
    let stores: Vec<_> = csv_readers
        .into_par_iter()
        .enumerate()
-        .map(|(i, rdr)| index_csv(rdr, i, num_threads))
+        .map(|(i, rdr)| index_csv(rdr, i, num_threads, max_memory_usage))
        .collect::<Result<_, _>>()?;
    let stores: Vec<_> = stores.into_iter().flatten().collect();