Put the documents MTBL back into LMDB

We make sure to write the documents into a file before
memory-mapping it and putting it into LMDB; this way we
avoid copying the documents into RAM.
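
The flow this enables, as a minimal standalone sketch in Rust (the `store_documents` helper and the bare `"documents"` key are illustrative; the real code goes through `Index::put_documents` and an `MtblCodec`, as the diffs below show):

use std::io::Write;

use heed::types::{ByteSlice, Str};

// Sketch: write the serialized documents into an anonymous temporary
// file, memory-map it, and store the mapped bytes under a single LMDB
// key. LMDB reads from the mapping page by page, so we never have to
// materialize the whole documents store in RAM ourselves.
fn store_documents(env: &heed::Env, main: heed::PolyDatabase, documents: &[u8]) -> anyhow::Result<()> {
    let mut file = tempfile::tempfile()?; // unlinked, cleaned up on drop
    file.write_all(documents)?;

    // Safety: the file is private to this process and is not mutated
    // while the mapping is alive.
    let mmap = unsafe { memmap::Mmap::map(&file)? };

    let mut wtxn = env.write_txn()?;
    main.put::<_, Str, ByteSlice>(&mut wtxn, "documents", &mmap)?;
    wtxn.commit()?;
    Ok(())
}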
Clément Renault 2020-08-28 15:38:05 +02:00
parent d784d87880
commit 0a44ff86ab
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
10 changed files with 100 additions and 110 deletions

View File

@@ -63,8 +63,5 @@ $('#docs-count').text(function(index, text) {
 // Make the database a little bit easier to read
 $('#db-size').text(function(index, text) {
-  let arr = text.split("+");
-  let database_size = filesize(parseInt(arr[0]));
-  let documents_size = filesize(parseInt(arr[1]));
-  return `${database_size} + ${documents_size}`
+  return filesize(parseInt(text))
 });

View File

@ -1,5 +1,5 @@
use std::convert::{TryFrom, TryInto}; use std::convert::{TryFrom, TryInto};
use std::fs::{File, OpenOptions}; use std::fs::File;
use std::io::{self, Read, Write}; use std::io::{self, Read, Write};
use std::iter::FromIterator; use std::iter::FromIterator;
use std::path::PathBuf; use std::path::PathBuf;
@@ -14,7 +14,7 @@ use flate2::read::GzDecoder;
 use fst::IntoStreamer;
 use heed::EnvOpenOptions;
 use heed::types::*;
-use log::debug;
+use log::{debug, info};
 use memmap::Mmap;
 use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType};
 use rayon::prelude::*;
@@ -486,9 +486,9 @@ fn main() -> anyhow::Result<()> {
         .max_dbs(10)
         .open(&opt.database)?;

-    let mut index = Index::new(&env, &opt.database)?;
-    let documents_path = opt.database.join("documents.mtbl");
+    let before_indexing = Instant::now();
+    let index = Index::new(&env)?;

     let num_threads = rayon::current_num_threads();
     let arc_cache_size = opt.indexer.arc_cache_size;
     let max_nb_chunks = opt.indexer.max_nb_chunks;
@@ -566,17 +566,9 @@ fn main() -> anyhow::Result<()> {
         docs_stores.push(d);
     });

-    debug!("We are writing into LMDB and MTBL...");
-
-    // We run both merging steps in parallel.
-    let (lmdb, mtbl) = rayon::join(|| {
-        // We merge the postings lists into LMDB.
-        let mut wtxn = env.write_txn()?;
-        merge_into_lmdb(stores, |k, v| lmdb_writer(&mut wtxn, &index, k, v))?;
-        Ok(wtxn.commit()?) as anyhow::Result<_>
-    }, || {
-        // We also merge the documents into its own MTBL store.
-        let file = OpenOptions::new().create(true).truncate(true).write(true).read(true).open(documents_path)?;
+    debug!("We are writing the documents into MTBL on disk...");
+    // We also merge the documents into its own MTBL store.
+    let file = tempfile::tempfile()?;
     let mut writer = Writer::builder()
         .compression_type(documents_compression_type)
         .compression_level(documents_compression_level)
@@ -584,14 +576,18 @@ fn main() -> anyhow::Result<()> {
     let mut builder = Merger::builder(docs_merge);
     builder.extend(docs_stores);
     builder.build().write_into(&mut writer)?;
-    Ok(writer.finish()?) as anyhow::Result<_>
-    });
+    let file = writer.into_inner()?;
+    let documents_mmap = unsafe { memmap::Mmap::map(&file)? };

-    lmdb.and(mtbl)?;
-    index.refresh_documents()?;
-    let count = index.number_of_documents();
+    debug!("We are writing the postings lists and documents into LMDB on disk...");
+    // We merge the postings lists into LMDB.
+    let mut wtxn = env.write_txn()?;
+    merge_into_lmdb(stores, |k, v| lmdb_writer(&mut wtxn, &index, k, v))?;
+    index.put_documents(&mut wtxn, &documents_mmap)?;
+    let count = index.number_of_documents(&wtxn)?;
+    wtxn.commit()?;

-    debug!("Wrote {} documents into LMDB", count);
+    info!("Wrote {} documents in {:.02?}", count, before_indexing.elapsed());

     Ok(())
 }

View File

@@ -96,7 +96,7 @@ fn main() -> anyhow::Result<()> {
         .open(&opt.database)?;

     // Open the LMDB database.
-    let index = Index::new(&env, opt.database)?;
+    let index = Index::new(&env)?;

     let rtxn = env.read_txn()?;
     match opt.command {
@@ -200,6 +200,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
             if heap.len() > limit { heap.pop(); }
         }

+        if let Some(documents) = index.main.get::<_, ByteSlice, ByteSlice>(rtxn, b"documents")? {
+            heap.push(Reverse((documents.len(), format!("documents"), main_name)));
+            if heap.len() > limit { heap.pop(); }
+        }
+
         for result in index.word_positions.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? {
             let (word, value) = result?;
             heap.push(Reverse((value.len(), word.to_string(), word_positions_name)));

View File

@@ -49,7 +49,7 @@ fn main() -> anyhow::Result<()> {
         .open(&opt.database)?;

     // Open the LMDB database.
-    let index = Index::new(&env, opt.database)?;
+    let index = Index::new(&env)?;

     let rtxn = env.read_txn()?;
     let stdin = io::stdin();
@@ -68,7 +68,7 @@ fn main() -> anyhow::Result<()> {
             Some(headers) => headers,
             None => return Ok(()),
         };
-        let documents = index.documents(result.documents_ids.iter().cloned())?;
+        let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?;

         let mut stdout = io::stdout();
         stdout.write_all(&headers)?;

View File

@@ -62,7 +62,6 @@ fn highlight_string(string: &str, words: &HashSet<String>) -> String {
 struct IndexTemplate {
     db_name: String,
     db_size: usize,
-    docs_size: usize,
     docs_count: usize,
 }
@@ -83,28 +82,23 @@ async fn main() -> anyhow::Result<()> {
         .open(&opt.database)?;

     // Open the LMDB database.
-    let index = Index::new(&env, &opt.database)?;
+    let index = Index::new(&env)?;

     // Retrieve the database the file stem (w/o the extension),
     // the disk file size and the number of documents in the database.
     let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
     let db_size = File::open(opt.database.join("data.mdb"))?.metadata()?.len() as usize;
-    let docs_size = File::open(opt.database.join("documents.mtbl"))?.metadata()?.len() as usize;
-    let docs_count = index.number_of_documents();
+
+    let rtxn = env.read_txn()?;
+    let docs_count = index.number_of_documents(&rtxn)? as usize;
+    drop(rtxn);

     // We run and wait on the HTTP server
     // Expose an HTML page to debug the search in a browser
     let dash_html_route = warp::filters::method::get()
         .and(warp::filters::path::end())
-        .map(move || {
-            IndexTemplate {
-                db_name: db_name.clone(),
-                db_size,
-                docs_size,
-                docs_count: docs_count as usize,
-            }
-        });
+        .map(move || IndexTemplate { db_name: db_name.clone(), db_size, docs_count });

     let dash_bulma_route = warp::filters::method::get()
         .and(warp::path!("bulma.min.css"))
@@ -192,7 +186,7 @@ async fn main() -> anyhow::Result<()> {
             if let Some(headers) = index.headers(&rtxn).unwrap() {
                 // We write the headers
                 body.extend_from_slice(headers);

-                let documents = index.documents(documents_ids).unwrap();
+                let documents = index.documents(&rtxn, documents_ids).unwrap();
                 for (_id, content) in documents {
                     let content = std::str::from_utf8(content.as_ref()).unwrap();

View File

@@ -1,5 +1,7 @@
+mod mtbl_codec;
 mod roaring_bitmap_codec;
 mod str_beu32_codec;

+pub use self::mtbl_codec::MtblCodec;
 pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
 pub use self::str_beu32_codec::StrBEU32Codec;

View File

@@ -0,0 +1,20 @@
+use std::borrow::Cow;
+
+use oxidized_mtbl::Reader;
+
+pub struct MtblCodec;
+
+impl<'a> heed::BytesDecode<'a> for MtblCodec {
+    type DItem = Reader<&'a [u8]>;
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        Reader::new(bytes).ok()
+    }
+}
+
+impl heed::BytesEncode<'_> for MtblCodec {
+    type EItem = [u8];
+
+    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
+        Some(Cow::Borrowed(item))
+    }
+}
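
For context, a hedged sketch of how this codec is meant to be used (mirroring the `lib.rs` changes below; `fetch_document` is a made-up helper, and the reader is cloned before `get` exactly as `Index::documents` does):

use heed::types::Str;

// Sketch: decode the value stored under the "documents" key as an MTBL
// reader, then look a single document up by its big-endian id. Decoding
// borrows the LMDB page for the lifetime of the read transaction, so no
// bytes are copied up front.
fn fetch_document(
    main: heed::PolyDatabase,
    rtxn: &heed::RoTxn,
    id: u32,
) -> anyhow::Result<Option<Vec<u8>>> {
    let reader = match main.get::<_, Str, MtblCodec>(rtxn, "documents")? {
        Some(reader) => reader,
        None => return Ok(None),
    };
    let content = reader.clone().get(&id.to_be_bytes())?;
    Ok(content.map(|content| content.as_ref().to_vec()))
}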

View File

@@ -2,27 +2,20 @@ mod criterion;
 mod node;
 mod query_tokens;
 mod search;
-mod transitive_arc;
 pub mod heed_codec;
 pub mod lexer;

 use std::collections::HashMap;
-use std::fs::{File, OpenOptions};
 use std::hash::BuildHasherDefault;
-use std::path::{Path, PathBuf};
-use std::sync::Arc;

-use anyhow::Context;
+use anyhow::{bail, Context};
 use fxhash::{FxHasher32, FxHasher64};
 use heed::types::*;
 use heed::{PolyDatabase, Database};
-use memmap::Mmap;
-use oxidized_mtbl as omtbl;

 pub use self::search::{Search, SearchResult};
 pub use self::criterion::{Criterion, default_criteria};
-use self::heed_codec::{RoaringBitmapCodec, StrBEU32Codec};
-use self::transitive_arc::TransitiveArc;
+use self::heed_codec::{MtblCodec, RoaringBitmapCodec, StrBEU32Codec};

 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
@@ -34,10 +27,12 @@ pub type DocumentId = u32;
 pub type Attribute = u32;
 pub type Position = u32;

+const WORDS_FST_KEY: &str = "words-fst";
+const HEADERS_KEY: &str = "headers";
+const DOCUMENTS_KEY: &str = "documents";
+
 #[derive(Clone)]
 pub struct Index {
-    // The database path, where the LMDB and MTBL files are.
-    path: PathBuf,
     /// Contains many different types (e.g. the documents CSV headers).
     pub main: PolyDatabase,
     /// A word and all the positions where it appears in the whole dataset.
@@ -46,44 +41,24 @@ pub struct Index {
     pub word_position_docids: Database<StrBEU32Codec, RoaringBitmapCodec>,
     /// Maps a word and an attribute (u32) to all the documents ids where the given word appears.
     pub word_attribute_docids: Database<StrBEU32Codec, RoaringBitmapCodec>,
-    /// The MTBL store that contains the documents content.
-    documents: omtbl::Reader<TransitiveArc<Mmap>>,
 }

 impl Index {
-    pub fn new<P: AsRef<Path>>(env: &heed::Env, path: P) -> anyhow::Result<Index> {
-        let documents_path = path.as_ref().join("documents.mtbl");
-        let mut documents = OpenOptions::new().create(true).write(true).read(true).open(documents_path)?;
-        // If the file is empty we must initialize it like an empty MTBL database.
-        if documents.metadata()?.len() == 0 {
-            omtbl::Writer::new(&mut documents).finish()?;
-        }
-        let documents = unsafe { memmap::Mmap::map(&documents)? };
-
+    pub fn new(env: &heed::Env) -> anyhow::Result<Index> {
         Ok(Index {
-            path: path.as_ref().to_path_buf(),
             main: env.create_poly_database(None)?,
             word_positions: env.create_database(Some("word-positions"))?,
             word_position_docids: env.create_database(Some("word-position-docids"))?,
             word_attribute_docids: env.create_database(Some("word-attribute-docids"))?,
-            documents: omtbl::Reader::new(TransitiveArc(Arc::new(documents)))?,
         })
     }

-    pub fn refresh_documents(&mut self) -> anyhow::Result<()> {
-        let documents_path = self.path.join("documents.mtbl");
-        let documents = File::open(&documents_path)?;
-        let documents = unsafe { memmap::Mmap::map(&documents)? };
-        self.documents = omtbl::Reader::new(TransitiveArc(Arc::new(documents)))?;
-        Ok(())
-    }
-
     pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &[u8]) -> anyhow::Result<()> {
-        Ok(self.main.put::<_, Str, ByteSlice>(wtxn, "headers", headers)?)
+        Ok(self.main.put::<_, Str, ByteSlice>(wtxn, HEADERS_KEY, headers)?)
     }

     pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
-        self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
+        self.main.get::<_, Str, ByteSlice>(rtxn, HEADERS_KEY)
     }

     pub fn number_of_attributes<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result<Option<usize>> {
@@ -98,29 +73,46 @@ impl Index {
     }

     pub fn put_fst<A: AsRef<[u8]>>(&self, wtxn: &mut heed::RwTxn, fst: &fst::Set<A>) -> anyhow::Result<()> {
-        Ok(self.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", fst.as_fst().as_bytes())?)
+        Ok(self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes())?)
     }

     pub fn fst<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
-        match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? {
             Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
             None => Ok(None),
         }
     }

     /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
-    pub fn documents<I: IntoIterator<Item=DocumentId>>(&self, iter: I) -> anyhow::Result<Vec<(DocumentId, Vec<u8>)>> {
-        iter.into_iter().map(|id| {
-            let key = id.to_be_bytes();
-            let content = self.documents.clone().get(&key)?.with_context(|| format!("Could not find document {}.", id))?;
-            Ok((id, content.as_ref().to_vec()))
-        })
-        .collect()
+    pub fn documents<'t>(
+        &self,
+        rtxn: &'t heed::RoTxn,
+        iter: impl IntoIterator<Item=DocumentId>,
+    ) -> anyhow::Result<Vec<(DocumentId, Vec<u8>)>>
+    {
+        match self.main.get::<_, Str, MtblCodec>(rtxn, DOCUMENTS_KEY)? {
+            Some(documents) => {
+                iter.into_iter().map(|id| {
+                    let key = id.to_be_bytes();
+                    let content = documents.clone().get(&key)?
+                        .with_context(|| format!("Could not find document {}", id))?;
+                    Ok((id, content.as_ref().to_vec()))
+                }).collect()
+            },
+            None => bail!("No documents database found"),
+        }
+    }
+
+    pub fn put_documents(&self, wtxn: &mut heed::RwTxn, documents: &[u8]) -> anyhow::Result<()> {
+        Ok(self.main.put::<_, Str, MtblCodec>(wtxn, DOCUMENTS_KEY, documents)?)
     }

     /// Returns the number of documents indexed in the database.
-    pub fn number_of_documents(&self) -> usize {
-        self.documents.metadata().count_entries as usize
+    pub fn number_of_documents<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result<usize> {
+        match self.main.get::<_, Str, MtblCodec>(rtxn, DOCUMENTS_KEY)? {
+            Some(documents) => Ok(documents.metadata().count_entries as usize),
+            None => return Ok(0),
+        }
     }

     pub fn search<'a>(&'a self, rtxn: &'a heed::RoTxn) -> Search<'a> {
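
The knock-on effect for callers, visible in the `search` and `http-ui` diffs above, is that every document read now threads an explicit read transaction through. A minimal sketch of the new call shape (the document ids are made up):

// Sketch: document reads now require an LMDB read transaction, since
// the MTBL store lives inside LMDB instead of a side `documents.mtbl` file.
fn print_some_documents(env: &heed::Env, index: &Index) -> anyhow::Result<()> {
    let rtxn = env.read_txn()?;
    for (id, content) in index.documents(&rtxn, vec![0u32, 1, 2])? {
        println!("document {}: {} bytes", id, content.len());
    }
    Ok(())
}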

View File

@ -1,16 +0,0 @@
use std::sync::Arc;
/// An `Arc<[u8]>` that is transitive over `AsRef<[u8]>`.
pub struct TransitiveArc<T>(pub Arc<T>);
impl<T: AsRef<[u8]>> AsRef<[u8]> for TransitiveArc<T> {
fn as_ref(&self) -> &[u8] {
self.0.as_ref().as_ref()
}
}
impl<T> Clone for TransitiveArc<T> {
fn clone(&self) -> TransitiveArc<T> {
TransitiveArc(self.0.clone())
}
}

View File

@@ -34,7 +34,7 @@
       <div class="level-item has-text-centered">
         <div>
           <p class="heading">Database Size</p>
-          <p class="title" id="db-size">{{ db_size }} + {{ docs_size }}</p>
+          <p class="title" id="db-size">{{ db_size }}</p>
         </div>
       </div>
       <div class="level-item has-text-centered">