Directly write to LMDB without intermediate final MTBL

This commit is contained in:
Kerollmops 2020-06-01 21:09:32 +02:00
parent 2174042994
commit 1df1f88fe1
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -1,5 +1,5 @@
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::convert::{TryInto, TryFrom}; use std::convert::TryFrom;
use std::fs::File; use std::fs::File;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::atomic::{AtomicUsize, Ordering};
@ -15,7 +15,7 @@ use roaring::RoaringBitmap;
use structopt::StructOpt; use structopt::StructOpt;
use mega_mini_indexer::alphanumeric_tokens; use mega_mini_indexer::alphanumeric_tokens;
use mega_mini_indexer::{FastMap4, SmallVec32, BEU32, Index, DocumentId}; use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId};
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
#[global_allocator] #[global_allocator]
@ -136,7 +136,9 @@ impl MtblKvStore {
} }
} }
fn from_many(stores: Vec<MtblKvStore>) -> anyhow::Result<MtblKvStore> { fn from_many<F>(stores: Vec<MtblKvStore>, mut f: F) -> anyhow::Result<()>
where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()>
{
eprintln!("{:?}: Merging {} MTBL stores...", rayon::current_thread_index(), stores.len()); eprintln!("{:?}: Merging {} MTBL stores...", rayon::current_thread_index(), stores.len());
let mmaps: Vec<_> = stores.iter().flat_map(|m| { let mmaps: Vec<_> = stores.iter().flat_map(|m| {
@ -147,21 +149,16 @@ impl MtblKvStore {
Reader::new(&mmap, ReaderOptions::default()).unwrap() Reader::new(&mmap, ReaderOptions::default()).unwrap()
}).collect(); }).collect();
let outfile = tempfile::tempfile()?;
let mut out = Writer::new(outfile, None)?;
let opt = MergerOptions { merge: MtblKvStore::merge }; let opt = MergerOptions { merge: MtblKvStore::merge };
let mut merger = Merger::new(sources, opt); let mut merger = Merger::new(sources, opt);
let mut iter = merger.iter(); let mut iter = merger.iter();
while let Some((k, v)) = iter.next() { while let Some((k, v)) = iter.next() {
out.add(k, v).unwrap(); (f)(k, v)?;
} }
let out = out.into_inner()?;
eprintln!("{:?}: MTBL stores merged!", rayon::current_thread_index()); eprintln!("{:?}: MTBL stores merged!", rayon::current_thread_index());
Ok(MtblKvStore(Some(out))) Ok(())
} }
} }
@ -226,46 +223,32 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
} }
// TODO merge with the previous values // TODO merge with the previous values
fn writer(wtxn: &mut heed::RwTxn, index: Index, mtbl_store: MtblKvStore) -> anyhow::Result<usize> { fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyhow::Result<()> {
let mtbl_store = match mtbl_store.0 { if key == b"\0words-fst" {
Some(store) => unsafe { memmap::Mmap::map(&store)? }, // Write the words fst
None => return Ok(0), index.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", val)?;
}; }
let mtbl_store = Reader::new(&mtbl_store, ReaderOptions::default()).unwrap(); else if key == b"\0headers" {
// Write the headers
// Write the words fst index.main.put::<_, Str, ByteSlice>(wtxn, "headers", val)?;
let fst = mtbl_store.get(b"\0words-fst").unwrap(); }
let fst = fst::Set::new(fst)?; else if key.starts_with(&[1]) {
index.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &fst.as_fst().as_bytes())?; // Write the postings lists
index.postings_ids.as_polymorph()
// Write and merge the headers .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
let headers = mtbl_store.get(b"\0headers").unwrap(); }
index.main.put::<_, Str, ByteSlice>(wtxn, "headers", headers.as_ref())?; else if key.starts_with(&[2]) {
// Write the prefix postings lists
// Write and merge the postings lists index.prefix_postings_ids.as_polymorph()
let mut iter = mtbl_store.iter_prefix(&[1]).unwrap(); .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
while let Some((word, postings)) = iter.next() { }
let word = std::str::from_utf8(&word[1..]).unwrap(); else if key.starts_with(&[3]) {
index.postings_ids.put(wtxn, &word, &postings)?; // Write the documents
index.documents.as_polymorph()
.put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
} }
// Write and merge the prefix postings lists Ok(())
let mut iter = mtbl_store.iter_prefix(&[2]).unwrap();
while let Some((word, postings)) = iter.next() {
let word = std::str::from_utf8(&word[1..]).unwrap();
index.prefix_postings_ids.put(wtxn, &word, &postings)?;
}
// Write the documents
let mut count = 0;
let mut iter = mtbl_store.iter_prefix(&[3]).unwrap();
while let Some((id_bytes, content)) = iter.next() {
let id = id_bytes[1..].try_into().map(u32::from_be_bytes).unwrap();
index.documents.put(wtxn, &BEU32::new(id), &content)?;
count += 1;
}
Ok(count)
} }
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
@ -291,18 +274,24 @@ fn main() -> anyhow::Result<()> {
}) })
.collect::<Result<_, _>>()?; .collect::<Result<_, _>>()?;
while stores.len() > 1 { while stores.len() > 3 {
let chunk_size = (stores.len() / rayon::current_num_threads()).max(2);
let s = std::mem::take(&mut stores); let s = std::mem::take(&mut stores);
stores = s.into_par_iter().chunks(3) stores = s.into_par_iter().chunks(chunk_size)
.map(MtblKvStore::from_many) .map(|v| {
let outfile = tempfile::tempfile()?;
let mut out = Writer::new(outfile, None)?;
MtblKvStore::from_many(v, |k, v| Ok(out.add(k, v).unwrap()))?;
let out = out.into_inner()?;
Ok(MtblKvStore(Some(out))) as anyhow::Result<_>
})
.collect::<Result<_, _>>()?; .collect::<Result<_, _>>()?;
} }
let mtbl_store = stores.pop().unwrap_or_default();
eprintln!("We are writing into LMDB..."); eprintln!("We are writing into LMDB...");
let mut wtxn = env.write_txn()?; let mut wtxn = env.write_txn()?;
let count = writer(&mut wtxn, index, mtbl_store)?; MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?;
let count = index.documents.len(&wtxn)?;
wtxn.commit()?; wtxn.commit()?;
eprintln!("Wrote {} documents into LMDB", count); eprintln!("Wrote {} documents into LMDB", count);