More efficiently merge MTBLs, more than two at a time

This commit is contained in:
Kerollmops 2020-06-04 16:17:24 +02:00
parent 1df1f88fe1
commit 3a23dc242e
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 19 additions and 17 deletions

2
Cargo.lock generated
View File

@ -996,7 +996,7 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
[[package]] [[package]]
name = "oxidized-mtbl" name = "oxidized-mtbl"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=8918476#8918476f61f4430890d067db7b4a6cfb2d549c43" source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=6acef3d#6acef3d0fc7fec6a3701038860e51f8bbcee1ee6"
dependencies = [ dependencies = [
"byteorder 1.3.4", "byteorder 1.3.4",
"crc32c", "crc32c",

View File

@ -18,7 +18,7 @@ jemallocator = "0.3.2"
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
memmap = "0.7.0" memmap = "0.7.0"
once_cell = "1.4.0" once_cell = "1.4.0"
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" } oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "6acef3d" }
rayon = "1.3.0" rayon = "1.3.0"
roaring = "0.5.2" roaring = "0.5.2"
slice-group-by = "0.2.6" slice-group-by = "0.2.6"

View File

@ -100,36 +100,38 @@ impl MtblKvStore {
Ok(MtblKvStore(Some(out))) Ok(MtblKvStore(Some(out)))
} }
fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option<Vec<u8>> { fn merge(key: &[u8], values: &[Vec<u8>]) -> Option<Vec<u8>> {
if key == b"\0words-fst" { if key == b"\0words-fst" {
let left_fst = fst::Set::new(left).unwrap(); let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
let right_fst = fst::Set::new(right).unwrap();
// Union of the two FSTs // Union of the two FSTs
let op = fst::set::OpBuilder::new() let mut op = fst::set::OpBuilder::new();
.add(left_fst.into_stream()) fsts.iter().for_each(|fst| op.push(fst.into_stream()));
.add(right_fst.into_stream()) let op = op.r#union();
.r#union();
let mut build = fst::SetBuilder::memory(); let mut build = fst::SetBuilder::memory();
build.extend_stream(op.into_stream()).unwrap(); build.extend_stream(op.into_stream()).unwrap();
Some(build.into_inner().unwrap()) Some(build.into_inner().unwrap())
} }
else if key == b"\0headers" { else if key == b"\0headers" {
assert_eq!(left, right); assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
Some(left.to_vec()) Some(values[0].to_vec())
} }
else if key.starts_with(&[1]) || key.starts_with(&[2]) { else if key.starts_with(&[1]) || key.starts_with(&[2]) {
let mut left = RoaringBitmap::deserialize_from(left).unwrap(); let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap();
let right = RoaringBitmap::deserialize_from(right).unwrap();
left.union_with(&right); for value in &values[1..] {
let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
first.union_with(&bitmap);
}
let mut vec = Vec::new(); let mut vec = Vec::new();
left.serialize_into(&mut vec).unwrap(); first.serialize_into(&mut vec).unwrap();
Some(vec) Some(vec)
} }
else if key.starts_with(&[3]) { else if key.starts_with(&[3]) {
assert_eq!(left, right); assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
Some(left.to_vec()) Some(values[0].to_vec())
} }
else { else {
panic!("wut? {:?}", key) panic!("wut? {:?}", key)