From f948a03be24b464b4f5a934a705c6d122cf81627 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Tue, 20 Oct 2020 16:40:50 +0200
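Subject: [PATCH] Optimise the merge functions to avoid allocations

Change the merge function signature from `&[Vec<u8>]` to `&[Cow<[u8]>]`
so that the LMDB write paths can hand the stored value and the incoming
value to the merge function as borrowed slices, instead of copying both
with `to_vec()` before every merge. Bump grenad to rev c390cfe in the
same change.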
---
 Cargo.lock                     |  2 +-
 Cargo.toml                     |  2 +-
 src/indexing/merge_function.rs | 20 +++++++++++---------
 src/indexing/mod.rs            |  9 ++++-----
 4 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index d32dd684b..4d0084fb0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -611,7 +611,7 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
 [[package]]
 name = "grenad"
 version = "0.1.0"
-source = "git+https://github.com/Kerollmops/grenad.git?rev=00099b5#00099b58092c67f7ec492a6b37de465289f3110b"
+source = "git+https://github.com/Kerollmops/grenad.git?rev=c390cfe#c390cfed1dc8a26ca108ffaeb7bdd978fa4e9021"
 dependencies = [
  "byteorder",
  "flate2",
diff --git a/Cargo.toml b/Cargo.toml
index a829d781d..889079f53 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,7 +13,7 @@ csv = "1.1.3"
 flate2 = "1.0.17"
 fst = "0.4.4"
 fxhash = "0.2.1"
-grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "00099b5" }
+grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "c390cfe" }
 heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
 human_format = "1.0.3"
 jemallocator = "0.3.2"
diff --git a/src/indexing/merge_function.rs b/src/indexing/merge_function.rs
index 29a9c9125..68ea53ac4 100644
--- a/src/indexing/merge_function.rs
+++ b/src/indexing/merge_function.rs
@@ -1,3 +1,5 @@
+use std::borrow::Cow;
+
 use anyhow::bail;
 use bstr::ByteSlice as _;
 use fst::IntoStreamer;
@@ -9,7 +11,7 @@ const WORDS_FST_KEY: &[u8] = crate::WORDS_FST_KEY.as_bytes();
 const HEADERS_KEY: &[u8] = crate::HEADERS_KEY.as_bytes();
 const DOCUMENTS_IDS_KEY: &[u8] = crate::DOCUMENTS_IDS_KEY.as_bytes();
 
-pub fn main_merge(key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> {
+pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
     match key {
         WORDS_FST_KEY => {
             let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
@@ -32,12 +34,12 @@ pub fn main_merge(key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> {
     }
 }
 
-pub fn word_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> {
+pub fn word_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
     let (head, tail) = values.split_first().unwrap();
-    let mut head = RoaringBitmap::deserialize_from(head.as_slice())?;
+    let mut head = RoaringBitmap::deserialize_from(&head[..])?;
 
     for value in tail {
-        let bitmap = RoaringBitmap::deserialize_from(value.as_slice())?;
+        let bitmap = RoaringBitmap::deserialize_from(&value[..])?;
         head.union_with(&bitmap);
     }
 
@@ -46,16 +48,16 @@ pub fn word_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<
     Ok(vec)
 }
 
-pub fn docid_word_positions_merge(key: &[u8], _values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> {
+pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
     bail!("merging docid word positions is an error ({:?})", key.as_bstr())
 }
 
-pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> {
+pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
     let (head, tail) = values.split_first().unwrap();
-    let mut head = CboRoaringBitmapCodec::deserialize_from(head.as_slice())?;
+    let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;
 
     for value in tail {
-        let bitmap = CboRoaringBitmapCodec::deserialize_from(value.as_slice())?;
+        let bitmap = CboRoaringBitmapCodec::deserialize_from(&value[..])?;
         head.union_with(&bitmap);
     }
 
@@ -64,6 +66,6 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Vec<u8>]) ->
     Ok(vec)
 }
 
-pub fn documents_merge(key: &[u8], _values: &[Vec<u8>]) -> anyhow::Result<Vec<u8>> {
+pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
     bail!("merging documents is an error ({:?})", key.as_bstr())
 }
diff --git a/src/indexing/mod.rs b/src/indexing/mod.rs
index 14bfae991..5e7293bcd 100644
--- a/src/indexing/mod.rs
+++ b/src/indexing/mod.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::fs::File;
 use std::io::{self, Read, Seek, SeekFrom};
 use std::sync::mpsc::sync_channel;
@@ -81,7 +82,7 @@ enum WriteMethod {
     GetMergePut,
 }
 
-type MergeFn = fn(&[u8], &[Vec<u8>]) -> anyhow::Result<Vec<u8>>;
+type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
 
 fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
     let mut builder = Writer::builder();
@@ -159,8 +160,7 @@ fn merge_into_lmdb_database(
             while let Some((k, v)) = in_iter.next()? {
                 match database.get::<_, ByteSlice, ByteSlice>(wtxn, k)? {
                     Some(old_val) => {
-                        // TODO improve the function signature and avoid allocating here!
-                        let vals = vec![old_val.to_vec(), v.to_vec()];
+                        let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
                         let val = merge(k, &vals).expect("merge failed");
                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, &val)?
                     },
@@ -195,8 +195,7 @@ fn write_into_lmdb_database(
             while let Some((k, v)) = reader.next()? {
                 match database.get::<_, ByteSlice, ByteSlice>(wtxn, k)? {
                     Some(old_val) => {
-                        // TODO improve the function signature and avoid alocating here!
-                        let vals = vec![old_val.to_vec(), v.to_vec()];
+                        let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
                         let val = merge(k, &vals).expect("merge failed");
                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, &val)?
                     },