mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 19:45:05 +08:00
Introduce a heed codec that reduces the size of small amounts of serialized integers
This commit is contained in:
parent
3e2250423c
commit
5664c37539
@ -21,7 +21,7 @@ use rayon::prelude::*;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
use milli::heed_codec::CsvStringRecordCodec;
|
use milli::heed_codec::{CsvStringRecordCodec, ByteorderXRoaringBitmapCodec};
|
||||||
use milli::tokenizer::{simple_tokenizer, only_words};
|
use milli::tokenizer::{simple_tokenizer, only_words};
|
||||||
use milli::{SmallVec32, Index, DocumentId, BEU32};
|
use milli::{SmallVec32, Index, DocumentId, BEU32};
|
||||||
|
|
||||||
@ -197,7 +197,6 @@ impl Store {
|
|||||||
{
|
{
|
||||||
// postings positions ids keys are all prefixed
|
// postings positions ids keys are all prefixed
|
||||||
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
||||||
let mut buffer = Vec::new();
|
|
||||||
|
|
||||||
// We prefix the words by the document id.
|
// We prefix the words by the document id.
|
||||||
key.extend_from_slice(&id.to_be_bytes());
|
key.extend_from_slice(&id.to_be_bytes());
|
||||||
@ -207,12 +206,11 @@ impl Store {
|
|||||||
key.truncate(base_size);
|
key.truncate(base_size);
|
||||||
key.extend_from_slice(word.as_bytes());
|
key.extend_from_slice(word.as_bytes());
|
||||||
// We serialize the positions into a buffer.
|
// We serialize the positions into a buffer.
|
||||||
buffer.clear();
|
let bytes = ByteorderXRoaringBitmapCodec::bytes_encode(&positions)
|
||||||
buffer.reserve(positions.serialized_size());
|
.with_context(|| format!("could not serialize positions"))?;
|
||||||
positions.serialize_into(&mut buffer)?;
|
|
||||||
// that we write under the generated key into MTBL
|
// that we write under the generated key into MTBL
|
||||||
if lmdb_key_valid_size(&key) {
|
if lmdb_key_valid_size(&key) {
|
||||||
sorter.insert(&key, &buffer)?;
|
sorter.insert(&key, &bytes)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -309,7 +307,11 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
|
|||||||
Ok(values[0].to_vec())
|
Ok(values[0].to_vec())
|
||||||
},
|
},
|
||||||
key => match key[0] {
|
key => match key[0] {
|
||||||
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORD_DOCID_POSITIONS_BYTE => {
|
WORD_DOCID_POSITIONS_BYTE => {
|
||||||
|
assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
|
||||||
|
Ok(values[0].to_vec())
|
||||||
|
},
|
||||||
|
DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => {
|
||||||
let (head, tail) = values.split_first().unwrap();
|
let (head, tail) = values.split_first().unwrap();
|
||||||
|
|
||||||
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
|
let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
|
||||||
|
@ -257,13 +257,13 @@ fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::
|
|||||||
|
|
||||||
fn average_number_of_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
|
fn average_number_of_positions(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
|
||||||
use heed::types::DecodeIgnore;
|
use heed::types::DecodeIgnore;
|
||||||
use milli::RoaringBitmapCodec;
|
use milli::ByteorderXRoaringBitmapCodec;
|
||||||
|
|
||||||
let mut values_length = Vec::new();
|
let mut values_length = Vec::new();
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
|
|
||||||
let iter = index.docid_word_positions.as_polymorph().iter::<_, DecodeIgnore, RoaringBitmapCodec>(rtxn)?;
|
let db = index.docid_word_positions.as_polymorph();
|
||||||
for result in iter {
|
for result in db.iter::<_, DecodeIgnore, ByteorderXRoaringBitmapCodec>(rtxn)? {
|
||||||
let ((), val) = result?;
|
let ((), val) = result?;
|
||||||
values_length.push(val.len() as u32);
|
values_length.push(val.len() as u32);
|
||||||
count += 1;
|
count += 1;
|
||||||
|
29
src/heed_codec/byteorder_x_roaring_bitmap_codec.rs
Normal file
29
src/heed_codec/byteorder_x_roaring_bitmap_codec.rs
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
/// A heed codec that stores a `RoaringBitmap` as a flat sequence of
/// native-endian `u32` values (four bytes per integer) instead of using
/// the bitmap's own serialization format.
pub struct ByteorderXRoaringBitmapCodec;
|
||||||
|
|
||||||
|
impl heed::BytesDecode<'_> for ByteorderXRoaringBitmapCodec {
|
||||||
|
type DItem = RoaringBitmap;
|
||||||
|
|
||||||
|
fn bytes_decode(mut bytes: &[u8]) -> Option<Self::DItem> {
|
||||||
|
let mut bitmap = RoaringBitmap::new();
|
||||||
|
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
|
||||||
|
bitmap.insert(integer);
|
||||||
|
}
|
||||||
|
Some(bitmap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl heed::BytesEncode<'_> for ByteorderXRoaringBitmapCodec {
|
||||||
|
type EItem = RoaringBitmap;
|
||||||
|
|
||||||
|
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||||
|
let mut bytes = Vec::with_capacity(item.len() as usize * 4);
|
||||||
|
for integer in item.iter() {
|
||||||
|
bytes.write_u32::<NativeEndian>(integer).ok()?;
|
||||||
|
}
|
||||||
|
Some(Cow::Owned(bytes))
|
||||||
|
}
|
||||||
|
}
|
@ -1,7 +1,9 @@
|
|||||||
|
mod beu32_str_codec;
|
||||||
|
mod byteorder_x_roaring_bitmap_codec;
|
||||||
mod csv_string_record_codec;
|
mod csv_string_record_codec;
|
||||||
mod roaring_bitmap_codec;
|
mod roaring_bitmap_codec;
|
||||||
mod beu32_str_codec;
|
|
||||||
|
|
||||||
|
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||||
|
pub use self::byteorder_x_roaring_bitmap_codec::ByteorderXRoaringBitmapCodec;
|
||||||
pub use self::csv_string_record_codec::CsvStringRecordCodec;
|
pub use self::csv_string_record_codec::CsvStringRecordCodec;
|
||||||
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
||||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
|
||||||
|
@ -15,7 +15,10 @@ use heed::{PolyDatabase, Database};
|
|||||||
|
|
||||||
pub use self::search::{Search, SearchResult};
|
pub use self::search::{Search, SearchResult};
|
||||||
pub use self::criterion::{Criterion, default_criteria};
|
pub use self::criterion::{Criterion, default_criteria};
|
||||||
pub use self::heed_codec::{RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec};
|
pub use self::heed_codec::{
|
||||||
|
RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec,
|
||||||
|
ByteorderXRoaringBitmapCodec,
|
||||||
|
};
|
||||||
|
|
||||||
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||||
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
|
pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
|
||||||
@ -38,7 +41,7 @@ pub struct Index {
|
|||||||
/// A word and all the documents ids containing the word.
|
/// A word and all the documents ids containing the word.
|
||||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
||||||
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
/// Maps a word and a document id (u32) to all the positions where the given word appears.
|
||||||
pub docid_word_positions: Database<BEU32StrCodec, RoaringBitmapCodec>,
|
pub docid_word_positions: Database<BEU32StrCodec, ByteorderXRoaringBitmapCodec>,
|
||||||
/// Maps the document id to the document as a CSV line.
|
/// Maps the document id to the document as a CSV line.
|
||||||
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user