mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Merge pull request #87 from meilisearch/roaring-bitmap-length
Introduce fast methods to get roaring bitmap lengths
This commit is contained in:
commit
09ca5d14c9
@ -1,14 +1,12 @@
|
||||
mod beu32_str_codec;
|
||||
mod bo_roaring_bitmap_codec;
|
||||
mod cbo_roaring_bitmap_codec;
|
||||
mod obkv_codec;
|
||||
mod roaring_bitmap_codec;
|
||||
mod roaring_bitmap;
|
||||
mod roaring_bitmap_length;
|
||||
mod str_str_u8_codec;
|
||||
pub mod facet;
|
||||
|
||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
|
||||
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
|
||||
pub use self::obkv_codec::ObkvCodec;
|
||||
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
||||
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
|
||||
pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||
pub use self::str_str_u8_codec::StrStrU8Codec;
|
||||
|
@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
|
||||
/// This is the limit where using a byteorder became less size efficient
|
||||
/// than using a direct roaring encoding, it is also the point where we are able
|
||||
/// to determine the encoding used only by using the array of bytes length.
|
||||
const THRESHOLD: usize = 7;
|
||||
pub const THRESHOLD: usize = 7;
|
||||
|
||||
/// A conditional codec that either uses the RoaringBitmap
|
||||
/// or a lighter ByteOrder en/decoding method.
|
7
milli/src/heed_codec/roaring_bitmap/mod.rs
Normal file
7
milli/src/heed_codec/roaring_bitmap/mod.rs
Normal file
@ -0,0 +1,7 @@
|
||||
mod bo_roaring_bitmap_codec;
|
||||
pub mod cbo_roaring_bitmap_codec;
|
||||
mod roaring_bitmap_codec;
|
||||
|
||||
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
|
||||
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
|
||||
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
@ -0,0 +1,11 @@
|
||||
use std::mem;
|
||||
|
||||
pub struct BoRoaringBitmapLenCodec;
|
||||
|
||||
impl heed::BytesDecode<'_> for BoRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Some((bytes.len() / mem::size_of::<u32>()) as u64)
|
||||
}
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
use std::mem;
|
||||
|
||||
use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
|
||||
|
||||
pub struct CboRoaringBitmapLenCodec;
|
||||
|
||||
impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
|
||||
// If there is threshold or less than threshold integers that can fit into this array
|
||||
// of bytes it means that we used the ByteOrder codec serializer.
|
||||
BoRoaringBitmapLenCodec::bytes_decode(bytes)
|
||||
} else {
|
||||
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
||||
// that the header takes threshold integers.
|
||||
RoaringBitmapLenCodec::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
}
|
7
milli/src/heed_codec/roaring_bitmap_length/mod.rs
Normal file
7
milli/src/heed_codec/roaring_bitmap_length/mod.rs
Normal file
@ -0,0 +1,7 @@
|
||||
mod bo_roaring_bitmap_len_codec;
|
||||
mod cbo_roaring_bitmap_len_codec;
|
||||
mod roaring_bitmap_len_codec;
|
||||
|
||||
pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
|
||||
pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
|
||||
pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;
|
@ -0,0 +1,83 @@
|
||||
use std::io::{self, Read, BufRead};
|
||||
use std::mem;
|
||||
|
||||
use byteorder::{ReadBytesExt, LittleEndian};
|
||||
|
||||
/// Cookie that opens a serialized roaring bitmap without run containers.
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
/// Cookie, compared against the low 16 bits of the first word, that marks a
/// serialized roaring bitmap with run containers (rejected by this codec).
const SERIAL_COOKIE: u16 = 12347;

/// Codec that computes the number of integers stored in a serialized
/// roaring bitmap by reading only its headers.
pub struct RoaringBitmapLenCodec;

impl RoaringBitmapLenCodec {
    // FIXME should be exported in the RoaringBitmap crate
    /// Computes the cardinality of a serialized roaring bitmap without
    /// materializing any container.
    ///
    /// Layout walked by this function: a `u32` cookie, a `u32` container
    /// count, one `(u16 key, u16 cardinality - 1)` pair per container, one
    /// `u32` offset per container, then the container payloads.
    ///
    /// # Errors
    ///
    /// Returns an error when the cookie is unknown or announces run
    /// containers, when the container count exceeds `u16::MAX + 1`, or when
    /// `bytes` is shorter than what its own headers announce. Truncated input
    /// is reported as [`io::ErrorKind::UnexpectedEof`] and never panics.
    fn deserialize_from_slice(mut bytes: &[u8]) -> io::Result<u64> {
        /// One descriptive header entry: a `u16` key and a `u16` cardinality minus one.
        const DESCRIPTION_ENTRY_SIZE: usize = 2 * mem::size_of::<u16>();

        let cookie = read_u32_le(&mut bytes)?;
        let size = if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
            read_u32_le(&mut bytes)? as usize
        } else if (cookie as u16) == SERIAL_COOKIE {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                "run containers are unsupported",
            ));
        } else {
            return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
        };

        if size > usize::from(u16::MAX) + 1 {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                "size is greater than supported",
            ));
        }

        // Borrow the descriptive header straight from the input instead of
        // copying it into a temporary buffer.
        let description = take_bytes(&mut bytes, size * DESCRIPTION_ENTRY_SIZE)?;

        // The offset header (one `u32` per container) is always present in
        // the only format accepted above; its content is not needed.
        take_bytes(&mut bytes, size * mem::size_of::<u32>())?;

        let mut length = 0u64;
        for entry in description.chunks_exact(DESCRIPTION_ENTRY_SIZE) {
            // Bytes 0..2 hold the container key, bytes 2..4 its cardinality minus one.
            let len = u64::from(u16::from_le_bytes([entry[2], entry[3]])) + 1;
            length += len;

            // Containers of at most 4096 values are stored as arrays of
            // `u16`, bigger ones as fixed-size bitmaps of 1024 `u64` words.
            let container_size = if len <= 4096 {
                len as usize * mem::size_of::<u16>()
            } else {
                1024 * mem::size_of::<u64>()
            };

            // Checked skip: a payload shorter than announced is an error.
            take_bytes(&mut bytes, container_size)?;
        }

        Ok(length)
    }
}

/// Reads a little-endian `u32` from the front of `bytes` and advances past it.
fn read_u32_le(bytes: &mut &[u8]) -> io::Result<u32> {
    let mut word = [0u8; mem::size_of::<u32>()];
    bytes.read_exact(&mut word)?;
    Ok(u32::from_le_bytes(word))
}

/// Splits the first `amt` bytes off the front of `bytes` and advances past them.
///
/// Unlike a bare `consume`, this returns an error instead of panicking when
/// fewer than `amt` bytes remain.
fn take_bytes<'a>(bytes: &mut &'a [u8], amt: usize) -> io::Result<&'a [u8]> {
    let remaining: &'a [u8] = *bytes;
    if amt > remaining.len() {
        return Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "unexpected end of serialized roaring bitmap",
        ));
    }
    bytes.consume(amt);
    Ok(&remaining[..amt])
}
|
||||
|
||||
impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use crate::heed_codec::RoaringBitmapCodec;
    use heed::BytesEncode;
    use roaring::RoaringBitmap;

    /// The length read from the serialized bytes must equal `RoaringBitmap::len`.
    #[test]
    fn deserialize_roaring_bitmap_length() {
        // Three disjoint ranges of different sizes.
        let values = (0..500).chain(800..800_000).chain(920_056..930_032);
        let expected: RoaringBitmap = values.collect();

        let encoded = RoaringBitmapCodec::bytes_encode(&expected).unwrap();
        let actual = RoaringBitmapLenCodec::deserialize_from_slice(&encoded).unwrap();

        assert_eq!(expected.len(), actual);
    }
}
|
@ -12,8 +12,8 @@ use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::{default_criteria, Criterion, Search, FacetDistribution};
|
||||
use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
|
||||
use crate::{
|
||||
RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
|
||||
BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
|
||||
StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
|
||||
};
|
||||
|
||||
pub const CRITERIA_KEY: &str = "criteria";
|
||||
@ -352,6 +352,17 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
/* word documents count */
|
||||
|
||||
/// Returns the number of documents ids associated with the given word,
|
||||
/// it is much faster than deserializing the bitmap and getting the length of it.
|
||||
pub fn word_documents_count(&self, rtxn: &RoTxn, word: &str) -> anyhow::Result<Option<u64>> {
|
||||
self.word_docids
|
||||
.remap_data_type::<RoaringBitmapLenCodec>()
|
||||
.get(rtxn, word)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
/* documents */
|
||||
|
||||
/// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
|
||||
|
@ -26,6 +26,7 @@ pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||
pub use self::fields_ids_map::FieldsIdsMap;
|
||||
pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
|
||||
pub use self::index::Index;
|
||||
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
|
||||
pub use self::update_store::UpdateStore;
|
||||
|
Loading…
Reference in New Issue
Block a user