Introduce the roaring bitmap heed codec

This commit is contained in:
Kerollmops 2020-06-22 17:56:07 +02:00
parent 8148210860
commit 2f0e1afd16
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 34 additions and 9 deletions

3
src/heed_codec/mod.rs Normal file
View File

@ -0,0 +1,3 @@
mod roaring_bitmap;
pub use self::roaring_bitmap::RoaringBitmapCodec;

View File

@ -0,0 +1,22 @@
use std::borrow::Cow;
use roaring::RoaringBitmap;
/// Zero-sized marker type that implements heed's `BytesEncode` and
/// `BytesDecode` traits for [`RoaringBitmap`] values.
///
/// It holds no data of its own; it is only meant to be named as a codec type.
pub struct RoaringBitmapCodec;
/// Decoding half of the codec: turns raw bytes back into a [`RoaringBitmap`].
impl heed::BytesDecode<'_> for RoaringBitmapCodec {
    type DItem = RoaringBitmap;

    /// Deserializes `bytes` with `RoaringBitmap::deserialize_from`.
    ///
    /// Returns `None` when deserialization reports an error; the error value
    /// itself is discarded.
    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
        match RoaringBitmap::deserialize_from(bytes) {
            Ok(bitmap) => Some(bitmap),
            Err(_) => None,
        }
    }
}
/// Encoding half of the codec: serializes a [`RoaringBitmap`] into bytes.
impl heed::BytesEncode<'_> for RoaringBitmapCodec {
    type EItem = RoaringBitmap;

    /// Serializes `item` into a freshly allocated buffer and hands it back as
    /// an owned `Cow`.
    ///
    /// Returns `None` when `serialize_into` reports an error; the error value
    /// itself is discarded.
    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
        let mut buffer: Vec<u8> = Vec::new();
        match item.serialize_into(&mut buffer) {
            Ok(()) => Some(Cow::Owned(buffer)),
            Err(_) => None,
        }
    }
}

View File

@ -1,4 +1,5 @@
mod best_proximity; mod best_proximity;
mod heed_codec;
mod iter_shortest_paths; mod iter_shortest_paths;
mod query_tokens; mod query_tokens;
@ -16,8 +17,9 @@ use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use self::query_tokens::{QueryTokens, QueryToken};
use self::best_proximity::BestProximity; use self::best_proximity::BestProximity;
use self::heed_codec::RoaringBitmapCodec;
use self::query_tokens::{QueryTokens, QueryToken};
// Building these factories is not free. // Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@ -35,10 +37,10 @@ pub type AttributeId = u32;
#[derive(Clone)] #[derive(Clone)]
pub struct Index { pub struct Index {
pub main: PolyDatabase, pub main: PolyDatabase,
pub postings_attrs: Database<Str, ByteSlice>, pub postings_attrs: Database<Str, RoaringBitmapCodec>,
pub prefix_postings_attrs: Database<ByteSlice, ByteSlice>, pub prefix_postings_attrs: Database<ByteSlice, RoaringBitmapCodec>,
pub postings_ids: Database<ByteSlice, ByteSlice>, pub postings_ids: Database<ByteSlice, RoaringBitmapCodec>,
pub prefix_postings_ids: Database<ByteSlice, ByteSlice>, pub prefix_postings_ids: Database<ByteSlice, RoaringBitmapCodec>,
pub documents: Database<OwnedType<BEU32>, ByteSlice>, pub documents: Database<OwnedType<BEU32>, ByteSlice>,
} }
@ -105,8 +107,7 @@ impl Index {
let mut stream = fst.search(&dfa).into_stream(); let mut stream = fst.search(&dfa).into_stream();
while let Some(word) = stream.next() { while let Some(word) = stream.next() {
let word = std::str::from_utf8(word)?; let word = std::str::from_utf8(word)?;
if let Some(attrs) = self.postings_attrs.get(rtxn, word)? { if let Some(right) = self.postings_attrs.get(rtxn, word)? {
let right = RoaringBitmap::deserialize_from_slice(attrs)?;
union_positions.union_with(&right); union_positions.union_with(&right);
derived_words.push((word.as_bytes().to_vec(), right)); derived_words.push((word.as_bytes().to_vec(), right));
count += 1; count += 1;
@ -130,8 +131,7 @@ impl Index {
if attrs.contains(pos) { if attrs.contains(pos) {
let mut key = word.clone(); let mut key = word.clone();
key.extend_from_slice(&pos.to_be_bytes()); key.extend_from_slice(&pos.to_be_bytes());
if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() { if let Some(right) = self.postings_ids.get(rtxn, &key).unwrap() {
let right = RoaringBitmap::deserialize_from_slice(attrs).unwrap();
union_docids.union_with(&right); union_docids.union_with(&right);
} }
} }