Optimise the merge functions to avoid allocations

This commit is contained in:
Clément Renault 2020-10-20 16:40:50 +02:00
parent cde8478388
commit f948a03be2
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
4 changed files with 17 additions and 16 deletions

2
Cargo.lock generated
View File

@ -611,7 +611,7 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
[[package]] [[package]]
name = "grenad" name = "grenad"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/Kerollmops/grenad.git?rev=00099b5#00099b58092c67f7ec492a6b37de465289f3110b" source = "git+https://github.com/Kerollmops/grenad.git?rev=c390cfe#c390cfed1dc8a26ca108ffaeb7bdd978fa4e9021"
dependencies = [ dependencies = [
"byteorder", "byteorder",
"flate2", "flate2",

View File

@ -13,7 +13,7 @@ csv = "1.1.3"
flate2 = "1.0.17" flate2 = "1.0.17"
fst = "0.4.4" fst = "0.4.4"
fxhash = "0.2.1" fxhash = "0.2.1"
grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "00099b5" } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "c390cfe" }
heed = { version = "0.8.1", default-features = false, features = ["lmdb"] } heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
human_format = "1.0.3" human_format = "1.0.3"
jemallocator = "0.3.2" jemallocator = "0.3.2"

View File

@ -1,3 +1,5 @@
use std::borrow::Cow;
use anyhow::bail; use anyhow::bail;
use bstr::ByteSlice as _; use bstr::ByteSlice as _;
use fst::IntoStreamer; use fst::IntoStreamer;
@ -9,7 +11,7 @@ const WORDS_FST_KEY: &[u8] = crate::WORDS_FST_KEY.as_bytes();
const HEADERS_KEY: &[u8] = crate::HEADERS_KEY.as_bytes(); const HEADERS_KEY: &[u8] = crate::HEADERS_KEY.as_bytes();
const DOCUMENTS_IDS_KEY: &[u8] = crate::DOCUMENTS_IDS_KEY.as_bytes(); const DOCUMENTS_IDS_KEY: &[u8] = crate::DOCUMENTS_IDS_KEY.as_bytes();
pub fn main_merge(key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> { pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
match key { match key {
WORDS_FST_KEY => { WORDS_FST_KEY => {
let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect(); let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
@ -32,12 +34,12 @@ pub fn main_merge(key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> {
} }
} }
pub fn word_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> { pub fn word_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
let (head, tail) = values.split_first().unwrap(); let (head, tail) = values.split_first().unwrap();
let mut head = RoaringBitmap::deserialize_from(head.as_slice())?; let mut head = RoaringBitmap::deserialize_from(&head[..])?;
for value in tail { for value in tail {
let bitmap = RoaringBitmap::deserialize_from(value.as_slice())?; let bitmap = RoaringBitmap::deserialize_from(&value[..])?;
head.union_with(&bitmap); head.union_with(&bitmap);
} }
@ -46,16 +48,16 @@ pub fn word_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<
Ok(vec) Ok(vec)
} }
pub fn docid_word_positions_merge(key: &[u8], _values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> { pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
bail!("merging docid word positions is an error ({:?})", key.as_bstr()) bail!("merging docid word positions is an error ({:?})", key.as_bstr())
} }
pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> { pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
let (head, tail) = values.split_first().unwrap(); let (head, tail) = values.split_first().unwrap();
let mut head = CboRoaringBitmapCodec::deserialize_from(head.as_slice())?; let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;
for value in tail { for value in tail {
let bitmap = CboRoaringBitmapCodec::deserialize_from(value.as_slice())?; let bitmap = CboRoaringBitmapCodec::deserialize_from(&value[..])?;
head.union_with(&bitmap); head.union_with(&bitmap);
} }
@ -64,6 +66,6 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) ->
Ok(vec) Ok(vec)
} }
pub fn documents_merge(key: &[u8], _values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> { pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
bail!("merging documents is an error ({:?})", key.as_bstr()) bail!("merging documents is an error ({:?})", key.as_bstr())
} }

View File

@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::fs::File; use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom}; use std::io::{self, Read, Seek, SeekFrom};
use std::sync::mpsc::sync_channel; use std::sync::mpsc::sync_channel;
@ -81,7 +82,7 @@ enum WriteMethod {
GetMergePut, GetMergePut,
} }
type MergeFn = fn(&[u8], &[Vec<u8>]) -> anyhow::Result<Vec<u8>>; type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> { fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
let mut builder = Writer::builder(); let mut builder = Writer::builder();
@ -159,8 +160,7 @@ fn merge_into_lmdb_database(
while let Some((k, v)) = in_iter.next()? { while let Some((k, v)) = in_iter.next()? {
match database.get::<_, ByteSlice, ByteSlice>(wtxn, k)? { match database.get::<_, ByteSlice, ByteSlice>(wtxn, k)? {
Some(old_val) => { Some(old_val) => {
// TODO improve the function signature and avoid allocating here! let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
let vals = vec![old_val.to_vec(), v.to_vec()];
let val = merge(k, &vals).expect("merge failed"); let val = merge(k, &vals).expect("merge failed");
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, &val)? database.put::<_, ByteSlice, ByteSlice>(wtxn, k, &val)?
}, },
@ -195,8 +195,7 @@ fn write_into_lmdb_database(
while let Some((k, v)) = reader.next()? { while let Some((k, v)) = reader.next()? {
match database.get::<_, ByteSlice, ByteSlice>(wtxn, k)? { match database.get::<_, ByteSlice, ByteSlice>(wtxn, k)? {
Some(old_val) => { Some(old_val) => {
// TODO improve the function signature and avoid allocating here! let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
let vals = vec![old_val.to_vec(), v.to_vec()];
let val = merge(k, &vals).expect("merge failed"); let val = merge(k, &vals).expect("merge failed");
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, &val)? database.put::<_, ByteSlice, ByteSlice>(wtxn, k, &val)?
}, },