From 5664c375395e0bc3cce9e5da82f499ed99559cab Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 7 Sep 2020 15:42:20 +0200 Subject: [PATCH] Introduce a heed codec that reduces the size of small amounts of serialized integers --- src/bin/indexer.rs | 16 +++++----- src/bin/infos.rs | 6 ++-- .../byteorder_x_roaring_bitmap_codec.rs | 29 +++++++++++++++++++ src/heed_codec/mod.rs | 6 ++-- src/lib.rs | 7 +++-- 5 files changed, 50 insertions(+), 14 deletions(-) create mode 100644 src/heed_codec/byteorder_x_roaring_bitmap_codec.rs diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index e66d63d7d..ae67abd99 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -21,7 +21,7 @@ use rayon::prelude::*; use roaring::RoaringBitmap; use structopt::StructOpt; -use milli::heed_codec::CsvStringRecordCodec; +use milli::heed_codec::{CsvStringRecordCodec, ByteorderXRoaringBitmapCodec}; use milli::tokenizer::{simple_tokenizer, only_words}; use milli::{SmallVec32, Index, DocumentId, BEU32}; @@ -197,7 +197,6 @@ impl Store { { // postings positions ids keys are all prefixed let mut key = vec![WORD_DOCID_POSITIONS_BYTE]; - let mut buffer = Vec::new(); // We prefix the words by the document id. key.extend_from_slice(&id.to_be_bytes()); @@ -207,12 +206,11 @@ impl Store { key.truncate(base_size); key.extend_from_slice(word.as_bytes()); // We serialize the positions into a buffer. 
- buffer.clear(); - buffer.reserve(positions.serialized_size()); - positions.serialize_into(&mut buffer)?; + let bytes = ByteorderXRoaringBitmapCodec::bytes_encode(&positions) + .with_context(|| format!("could not serialize positions"))?; // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { - sorter.insert(&key, &buffer)?; + sorter.insert(&key, &bytes)?; } } @@ -309,7 +307,11 @@ fn merge(key: &[u8], values: &[Vec]) -> Result, ()> { Ok(values[0].to_vec()) }, key => match key[0] { - DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORD_DOCID_POSITIONS_BYTE => { + WORD_DOCID_POSITIONS_BYTE => { + assert!(values.windows(2).all(|vs| vs[0] == vs[1])); + Ok(values[0].to_vec()) + }, + DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => { let (head, tail) = values.split_first().unwrap(); let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap(); diff --git a/src/bin/infos.rs b/src/bin/infos.rs index 1cd70d7cd..7fce7d853 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -257,13 +257,13 @@ fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow:: fn average_number_of_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; - use milli::RoaringBitmapCodec; + use milli::ByteorderXRoaringBitmapCodec; let mut values_length = Vec::new(); let mut count = 0; - let iter = index.docid_word_positions.as_polymorph().iter::<_, DecodeIgnore, RoaringBitmapCodec>(rtxn)?; - for result in iter { + let db = index.docid_word_positions.as_polymorph(); + for result in db.iter::<_, DecodeIgnore, ByteorderXRoaringBitmapCodec>(rtxn)? 
{ let ((), val) = result?; values_length.push(val.len() as u32); count += 1; diff --git a/src/heed_codec/byteorder_x_roaring_bitmap_codec.rs b/src/heed_codec/byteorder_x_roaring_bitmap_codec.rs new file mode 100644 index 000000000..4f920959e --- /dev/null +++ b/src/heed_codec/byteorder_x_roaring_bitmap_codec.rs @@ -0,0 +1,29 @@ +use std::borrow::Cow; +use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; +use roaring::RoaringBitmap; + +pub struct ByteorderXRoaringBitmapCodec; + +impl heed::BytesDecode<'_> for ByteorderXRoaringBitmapCodec { + type DItem = RoaringBitmap; + + fn bytes_decode(mut bytes: &[u8]) -> Option { + let mut bitmap = RoaringBitmap::new(); + while let Ok(integer) = bytes.read_u32::() { + bitmap.insert(integer); + } + Some(bitmap) + } +} + +impl heed::BytesEncode<'_> for ByteorderXRoaringBitmapCodec { + type EItem = RoaringBitmap; + + fn bytes_encode(item: &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(item.len() as usize * 4); + for integer in item.iter() { + bytes.write_u32::(integer).ok()?; + } + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/mod.rs b/src/heed_codec/mod.rs index fe449e23f..10b28fb50 100644 --- a/src/heed_codec/mod.rs +++ b/src/heed_codec/mod.rs @@ -1,7 +1,9 @@ +mod beu32_str_codec; +mod byteorder_x_roaring_bitmap_codec; mod csv_string_record_codec; mod roaring_bitmap_codec; -mod beu32_str_codec; +pub use self::beu32_str_codec::BEU32StrCodec; +pub use self::byteorder_x_roaring_bitmap_codec::ByteorderXRoaringBitmapCodec; pub use self::csv_string_record_codec::CsvStringRecordCodec; pub use self::roaring_bitmap_codec::RoaringBitmapCodec; -pub use self::beu32_str_codec::BEU32StrCodec; diff --git a/src/lib.rs b/src/lib.rs index 051e4c26d..55d2f583e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,7 +15,10 @@ use heed::{PolyDatabase, Database}; pub use self::search::{Search, SearchResult}; pub use self::criterion::{Criterion, default_criteria}; -pub use self::heed_codec::{RoaringBitmapCodec, 
BEU32StrCodec, CsvStringRecordCodec}; +pub use self::heed_codec::{ + RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec, + ByteorderXRoaringBitmapCodec, +}; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; @@ -38,7 +41,7 @@ pub struct Index { /// A word and all the documents ids containing the word. pub word_docids: Database, /// Maps a word and a document id (u32) to all the positions where the given word appears. - pub docid_word_positions: Database, + pub docid_word_positions: Database, /// Maps the document id to the document as a CSV line. pub documents: Database, ByteSlice>, }