From d0c73564b158d8ac78195e2abe6041afd099e2e5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 1 Oct 2020 11:14:26 +0200 Subject: [PATCH] Use the CboRoaringBitmapCodec for the word pair proximity docids --- src/bin/indexer.rs | 25 ++++++-- src/bin/infos.rs | 4 +- src/heed_codec/cbo_roaring_bitmap_codec.rs | 67 ++++++++++++++++------ src/lib.rs | 4 +- 4 files changed, 71 insertions(+), 29 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 004f733b3..8ca1a768b 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -21,7 +21,7 @@ use rayon::prelude::*; use roaring::RoaringBitmap; use structopt::StructOpt; -use milli::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec}; +use milli::heed_codec::{CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use milli::tokenizer::{simple_tokenizer, only_token}; use milli::{SmallVec32, Index, Position, DocumentId, BEU32}; @@ -335,8 +335,8 @@ impl Store { key.push(min_prox); // We serialize the document ids into a buffer buffer.clear(); - buffer.reserve(docids.serialized_size()); - docids.serialize_into(&mut buffer)?; + buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids)); + CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer)?; // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { sorter.insert(&key, &buffer)?; @@ -365,7 +365,7 @@ impl Store { // We serialize the positions into a buffer. let positions = RoaringBitmap::from_iter(positions.iter().cloned()); let bytes = BoRoaringBitmapCodec::bytes_encode(&positions) - .with_context(|| format!("could not serialize positions"))?; + .with_context(|| "could not serialize positions")?; // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { sorter.insert(&key, &bytes)?; @@ -515,10 +515,10 @@ fn merge(key: &[u8], values: &[Vec]) -> Result, ()> { assert!(values.windows(2).all(|vs| vs[0] == vs[1])); Ok(values[0].to_vec()) }, - DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORDS_PROXIMITIES_BYTE => { + DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => { let (head, tail) = values.split_first().unwrap(); - let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap(); + for value in tail { let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap(); head.union_with(&bitmap); @@ -528,6 +528,19 @@ fn merge(key: &[u8], values: &[Vec]) -> Result, ()> { head.serialize_into(&mut vec).unwrap(); Ok(vec) }, + WORDS_PROXIMITIES_BYTE => { + let (head, tail) = values.split_first().unwrap(); + let mut head = CboRoaringBitmapCodec::deserialize_from(head.as_slice()).unwrap(); + + for value in tail { + let bitmap = CboRoaringBitmapCodec::deserialize_from(value.as_slice()).unwrap(); + head.union_with(&bitmap); + } + + let mut vec = Vec::new(); + CboRoaringBitmapCodec::serialize_into(&head, &mut vec).unwrap(); + Ok(vec) + }, otherwise => panic!("wut {:?}", otherwise), } } diff --git a/src/bin/infos.rs b/src/bin/infos.rs index 831e14110..4cfe9d09e 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -385,12 +385,12 @@ fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Re fn word_pair_proximity_stats(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; - use milli::RoaringBitmapCodec; + use milli::CboRoaringBitmapCodec; let mut values_length = Vec::new(); let db = index.word_pair_proximity_docids.as_polymorph(); - for result in db.iter::<_, DecodeIgnore, RoaringBitmapCodec>(rtxn)? { + for result in db.iter::<_, DecodeIgnore, CboRoaringBitmapCodec>(rtxn)? { let ((), val) = result?; values_length.push(val.len() as u32); } diff --git a/src/heed_codec/cbo_roaring_bitmap_codec.rs b/src/heed_codec/cbo_roaring_bitmap_codec.rs index 9d1a0486a..337f4df48 100644 --- a/src/heed_codec/cbo_roaring_bitmap_codec.rs +++ b/src/heed_codec/cbo_roaring_bitmap_codec.rs @@ -1,25 +1,60 @@ use std::borrow::Cow; +use std::io; use std::mem::size_of; + +use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; -use super::{BoRoaringBitmapCodec, RoaringBitmapCodec}; /// A conditionnal codec that either use the RoaringBitmap /// or a lighter ByteOrder en/decoding method. pub struct CboRoaringBitmapCodec; +impl CboRoaringBitmapCodec { + pub fn serialized_size(roaring: &RoaringBitmap) -> usize { + if roaring.len() <= 4 { + roaring.len() as usize * size_of::() + } else { + roaring.serialized_size() + } + } + + pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) -> io::Result<()> { + if roaring.len() <= 4 { + // If the number of items (u32s) to encode is less than or equal to 4 + // it means that it would weigh the same or less than the RoaringBitmap + // header, so we directly encode them using ByteOrder instead. + for integer in roaring { + vec.write_u32::(integer)?; + } + Ok(()) + } else { + // Otherwise, we use the classic RoaringBitmapCodec that writes a header. + roaring.serialize_into(vec) + } + } + + pub fn deserialize_from(mut bytes: &[u8]) -> io::Result { + if bytes.len() <= 4 * size_of::() { + // If there is 4 or less than 4 integers that can fit into this array + // of bytes it means that we used the ByteOrder codec serializer. + let mut bitmap = RoaringBitmap::new(); + while let Ok(integer) = bytes.read_u32::() { + bitmap.insert(integer); + } + Ok(bitmap) + } else { + // Otherwise, it means we used the classic RoaringBitmapCodec and + // that the header takes 4 integers. + RoaringBitmap::deserialize_from(bytes) + } + } +} + impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { type DItem = RoaringBitmap; fn bytes_decode(bytes: &[u8]) -> Option { - if bytes.len() <= 4 * size_of::() { - // If there is 4 or less than 4 integers that can fit into this array - // of bytes it means that we used the ByteOrder codec serializer. - BoRoaringBitmapCodec::bytes_decode(bytes) - } else { - // Otherwise, it means we used the classic RoaringBitmapCodec and - // that the header takes 4 integers. - RoaringBitmapCodec::bytes_decode(bytes) - } + Self::deserialize_from(bytes).ok() } } @@ -27,14 +62,8 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { type EItem = RoaringBitmap; fn bytes_encode(item: &Self::EItem) -> Option> { - if item.len() <= 4 { - // If the number of items (u32s) to encode is less than or equal to 4 - // it means that it would weigh the same or less than the RoaringBitmap - // header, so we directly encode them using ByteOrder instead. - BoRoaringBitmapCodec::bytes_encode(item) - } else { - // Otherwise, we use the classic RoaringBitmapCodec that writes a header. - RoaringBitmapCodec::bytes_encode(item) - } + let mut vec = Vec::with_capacity(Self::serialized_size(item)); + Self::serialize_into(item, &mut vec).ok()?; + Some(Cow::Owned(vec)) } } diff --git a/src/lib.rs b/src/lib.rs index 1bc42928c..547189aa3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,7 @@ pub use self::search::{Search, SearchResult}; pub use self::criterion::{Criterion, default_criteria}; pub use self::heed_codec::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, - CsvStringRecordCodec, BoRoaringBitmapCodec, + CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, }; pub type FastMap4 = HashMap>; @@ -44,7 +44,7 @@ pub struct Index { /// Maps a word and a document id (u32) to all the positions where the given word appears. pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. - pub word_pair_proximity_docids: Database, + pub word_pair_proximity_docids: Database, /// Maps the document id to the document as a CSV line. pub documents: Database, ByteSlice>, }