From 3fe497e129bc99e346e1879cfcf95e46d383d803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 29 Aug 2020 11:20:39 +0200 Subject: [PATCH] Improve the Mtbl heed codec to only encode MTBL databases --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/bin/indexer.rs | 5 ++++- src/heed_codec/mtbl_codec.rs | 11 ++++++----- src/lib.rs | 9 +++++---- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 53546e3b1..447318d74 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1238,7 +1238,7 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" [[package]] name = "oxidized-mtbl" version = "0.1.0" -source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=13294cc#13294ccd73c9d6f71645a3ed2852656f3c86d31d" +source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=4ca66e5#4ca66e50115da760f602e878943af59f06c53af1" dependencies = [ "byteorder", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index e86eac185..50683e7cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } memmap = "0.7.0" once_cell = "1.4.0" -oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "13294cc" } +oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "4ca66e5" } rayon = "1.3.1" ringtail = "0.3.0" roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" } diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 708c57b4c..0dd7ef074 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -579,13 +579,16 @@ fn main() -> anyhow::Result<()> { builder.extend(docs_stores); builder.build().write_into(&mut writer)?; let file = writer.into_inner()?; + + // Read back the documents MTBL database from the file. let documents_mmap = unsafe { memmap::Mmap::map(&file)? }; + let documents = Reader::new(documents_mmap)?; debug!("We are writing the postings lists and documents into LMDB on disk..."); // We merge the postings lists into LMDB. let mut wtxn = env.write_txn()?; merge_into_lmdb(stores, |k, v| lmdb_writer(&mut wtxn, &index, k, v))?; - index.put_documents(&mut wtxn, &documents_mmap)?; + index.put_documents(&mut wtxn, &documents)?; let count = index.number_of_documents(&wtxn)?; wtxn.commit()?; diff --git a/src/heed_codec/mtbl_codec.rs b/src/heed_codec/mtbl_codec.rs index c36960079..b4815da4f 100644 --- a/src/heed_codec/mtbl_codec.rs +++ b/src/heed_codec/mtbl_codec.rs @@ -1,9 +1,10 @@ use std::borrow::Cow; +use std::marker::PhantomData; use oxidized_mtbl::Reader; -pub struct MtblCodec; +pub struct MtblCodec(PhantomData); -impl<'a> heed::BytesDecode<'a> for MtblCodec { +impl<'a> heed::BytesDecode<'a> for MtblCodec<&'a [u8]> { type DItem = Reader<&'a [u8]>; fn bytes_decode(bytes: &'a [u8]) -> Option { @@ -11,10 +12,10 @@ impl<'a> heed::BytesDecode<'a> for MtblCodec { } } -impl heed::BytesEncode<'_> for MtblCodec { - type EItem = [u8]; +impl<'a, A: AsRef<[u8]> + 'a> heed::BytesEncode<'a> for MtblCodec { + type EItem = Reader; fn bytes_encode(item: &Self::EItem) -> Option> { - Some(Cow::Borrowed(item)) + Some(Cow::Borrowed(item.as_bytes())) } } diff --git a/src/lib.rs b/src/lib.rs index bcfc3d2bb..65b8f0534 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,7 @@ use anyhow::{bail, Context}; use fxhash::{FxHasher32, FxHasher64}; use heed::types::*; use heed::{PolyDatabase, Database}; +use oxidized_mtbl as omtbl; pub use self::search::{Search, SearchResult}; pub use self::criterion::{Criterion, default_criteria}; @@ -90,7 +91,7 @@ impl Index { iter: impl IntoIterator, ) -> anyhow::Result)>> { - match self.main.get::<_, Str, MtblCodec>(rtxn, DOCUMENTS_KEY)? { + match self.main.get::<_, Str, MtblCodec<&[u8]>>(rtxn, DOCUMENTS_KEY)? { Some(documents) => { iter.into_iter().map(|id| { let key = id.to_be_bytes(); @@ -103,13 +104,13 @@ impl Index { } } - pub fn put_documents(&self, wtxn: &mut heed::RwTxn, documents: &[u8]) -> anyhow::Result<()> { - Ok(self.main.put::<_, Str, MtblCodec>(wtxn, DOCUMENTS_KEY, documents)?) + pub fn put_documents>(&self, wtxn: &mut heed::RwTxn, documents: &omtbl::Reader) -> anyhow::Result<()> { + Ok(self.main.put::<_, Str, MtblCodec>(wtxn, DOCUMENTS_KEY, documents)?) } /// Returns the number of documents indexed in the database. pub fn number_of_documents<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result { - match self.main.get::<_, Str, MtblCodec>(rtxn, DOCUMENTS_KEY)? { + match self.main.get::<_, Str, MtblCodec<&[u8]>>(rtxn, DOCUMENTS_KEY)? { Some(documents) => Ok(documents.metadata().count_entries as usize), None => return Ok(0), }