2023-04-04 21:01:42 +08:00
|
|
|
use std::borrow::Cow;
|
2023-03-08 16:55:53 +08:00
|
|
|
use std::collections::hash_map::Entry;
|
2023-03-08 20:26:29 +08:00
|
|
|
use std::hash::Hash;
|
2023-03-08 16:55:53 +08:00
|
|
|
|
2023-02-21 19:55:44 +08:00
|
|
|
use fxhash::FxHashMap;
|
2023-02-21 20:57:34 +08:00
|
|
|
use heed::types::ByteSlice;
|
2023-04-11 21:26:47 +08:00
|
|
|
use heed::{BytesDecode, BytesEncode, Database, RoTxn};
|
|
|
|
use roaring::RoaringBitmap;
|
2023-03-08 16:55:53 +08:00
|
|
|
|
2023-03-30 17:08:17 +08:00
|
|
|
use super::interner::Interned;
|
2023-04-12 00:26:44 +08:00
|
|
|
use super::Word;
|
2023-04-12 22:53:11 +08:00
|
|
|
use crate::heed_codec::StrBEU16Codec;
|
2023-04-11 21:26:47 +08:00
|
|
|
use crate::{
|
|
|
|
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
|
|
|
};
|
2023-02-21 16:43:23 +08:00
|
|
|
|
2023-03-08 20:26:29 +08:00
|
|
|
/// A cache storing pointers to values in the LMDB databases.
|
|
|
|
///
|
|
|
|
/// Used for performance reasons only. By using this cache, we avoid performing a
|
|
|
|
/// database lookup and instead get a direct reference to the value using a fast
|
|
|
|
/// local HashMap lookup.
|
2023-02-21 16:43:23 +08:00
|
|
|
#[derive(Default)]
|
2023-03-13 21:03:48 +08:00
|
|
|
pub struct DatabaseCache<'ctx> {
|
2023-03-07 02:21:55 +08:00
|
|
|
pub word_pair_proximity_docids:
|
2023-03-13 21:03:48 +08:00
|
|
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
2023-02-21 16:43:23 +08:00
|
|
|
pub word_prefix_pair_proximity_docids:
|
2023-03-13 21:03:48 +08:00
|
|
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
2023-02-21 22:10:22 +08:00
|
|
|
pub prefix_word_pair_proximity_docids:
|
2023-03-13 21:03:48 +08:00
|
|
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'ctx [u8]>>,
|
|
|
|
pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
|
|
|
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
|
|
|
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
2023-04-12 04:04:38 +08:00
|
|
|
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
|
2023-04-04 21:01:42 +08:00
|
|
|
|
|
|
|
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
2023-04-04 23:04:46 +08:00
|
|
|
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
|
|
|
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
2023-04-12 17:40:44 +08:00
|
|
|
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
|
|
|
|
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
|
|
|
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
2023-02-21 16:43:23 +08:00
|
|
|
}
|
2023-03-13 21:03:48 +08:00
|
|
|
impl<'ctx> DatabaseCache<'ctx> {
|
2023-03-08 20:26:29 +08:00
|
|
|
fn get_value<'v, K1, KC>(
|
2023-03-13 21:03:48 +08:00
|
|
|
txn: &'ctx RoTxn,
|
2023-03-08 20:26:29 +08:00
|
|
|
cache_key: K1,
|
|
|
|
db_key: &'v KC::EItem,
|
2023-03-13 21:03:48 +08:00
|
|
|
cache: &mut FxHashMap<K1, Option<&'ctx [u8]>>,
|
2023-03-08 20:26:29 +08:00
|
|
|
db: Database<KC, ByteSlice>,
|
2023-03-13 21:03:48 +08:00
|
|
|
) -> Result<Option<&'ctx [u8]>>
|
2023-03-08 20:26:29 +08:00
|
|
|
where
|
|
|
|
K1: Copy + Eq + Hash,
|
|
|
|
KC: BytesEncode<'v>,
|
|
|
|
{
|
|
|
|
let bitmap_ptr = match cache.entry(cache_key) {
|
2023-02-21 16:43:23 +08:00
|
|
|
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
|
|
|
|
Entry::Vacant(entry) => {
|
2023-03-08 20:26:29 +08:00
|
|
|
let bitmap_ptr = db.get(txn, db_key)?;
|
2023-02-21 16:43:23 +08:00
|
|
|
entry.insert(bitmap_ptr);
|
|
|
|
bitmap_ptr
|
|
|
|
}
|
|
|
|
};
|
|
|
|
Ok(bitmap_ptr)
|
|
|
|
}
|
2023-03-30 17:08:17 +08:00
|
|
|
}
|
|
|
|
impl<'ctx> SearchContext<'ctx> {
|
2023-04-04 21:01:42 +08:00
|
|
|
/// Returns the words FST of the index.
///
/// The FST is fetched from the index on the first call and a copy is kept in
/// `db_cache.words_fst`; subsequent calls return a clone of that copy.
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
    if let Some(cached) = &self.db_cache.words_fst {
        return Ok(cached.clone());
    }

    let loaded = self.index.words_fst(self.txn)?;
    self.db_cache.words_fst = Some(loaded.clone());
    Ok(loaded)
}
|
|
|
|
|
2023-04-12 00:26:44 +08:00
|
|
|
pub fn word_docids(&mut self, word: Word) -> Result<Option<RoaringBitmap>> {
|
|
|
|
match word {
|
|
|
|
Word::Original(word) => {
|
|
|
|
let exact = self.get_db_exact_word_docids(word)?;
|
|
|
|
let tolerant = self.get_db_word_docids(word)?;
|
|
|
|
Ok(match (exact, tolerant) {
|
|
|
|
(None, None) => None,
|
|
|
|
(None, Some(tolerant)) => Some(tolerant),
|
|
|
|
(Some(exact), None) => Some(exact),
|
|
|
|
(Some(exact), Some(tolerant)) => {
|
|
|
|
let mut both = exact;
|
|
|
|
both |= tolerant;
|
|
|
|
Some(both)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
Word::Derived(word) => self.get_db_word_docids(word),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-08 20:26:29 +08:00
|
|
|
/// Retrieve or insert the given value in the `word_docids` database.
|
2023-04-12 00:27:41 +08:00
|
|
|
fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<RoaringBitmap>> {
|
2023-03-30 17:08:17 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
2023-03-08 20:26:29 +08:00
|
|
|
word,
|
2023-03-30 17:08:17 +08:00
|
|
|
self.word_interner.get(word).as_str(),
|
|
|
|
&mut self.db_cache.word_docids,
|
|
|
|
self.index.word_docids.remap_data_type::<ByteSlice>(),
|
2023-04-11 21:26:47 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
2023-03-08 20:26:29 +08:00
|
|
|
}
|
2023-04-12 00:26:44 +08:00
|
|
|
|
|
|
|
fn get_db_exact_word_docids(
|
|
|
|
&mut self,
|
|
|
|
word: Interned<String>,
|
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
|
|
|
word,
|
|
|
|
self.word_interner.get(word).as_str(),
|
|
|
|
&mut self.db_cache.exact_word_docids,
|
|
|
|
self.index.exact_word_docids.remap_data_type::<ByteSlice>(),
|
|
|
|
)?
|
|
|
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
|
|
|
}
|
|
|
|
|
2023-04-12 04:04:38 +08:00
|
|
|
pub fn word_prefix_docids(&mut self, prefix: Word) -> Result<Option<RoaringBitmap>> {
|
|
|
|
match prefix {
|
|
|
|
Word::Original(prefix) => {
|
|
|
|
let exact = self.get_db_exact_word_prefix_docids(prefix)?;
|
|
|
|
let tolerant = self.get_db_word_prefix_docids(prefix)?;
|
|
|
|
Ok(match (exact, tolerant) {
|
|
|
|
(None, None) => None,
|
|
|
|
(None, Some(tolerant)) => Some(tolerant),
|
|
|
|
(Some(exact), None) => Some(exact),
|
|
|
|
(Some(exact), Some(tolerant)) => {
|
|
|
|
let mut both = exact;
|
|
|
|
both |= tolerant;
|
|
|
|
Some(both)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-08 20:26:29 +08:00
|
|
|
/// Retrieve or insert the given value in the `word_prefix_docids` database.
|
2023-04-12 04:06:10 +08:00
|
|
|
fn get_db_word_prefix_docids(
|
2023-03-08 20:26:29 +08:00
|
|
|
&mut self,
|
|
|
|
prefix: Interned<String>,
|
2023-04-11 21:26:47 +08:00
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
2023-03-30 17:08:17 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
2023-03-08 20:26:29 +08:00
|
|
|
prefix,
|
2023-03-30 17:08:17 +08:00
|
|
|
self.word_interner.get(prefix).as_str(),
|
|
|
|
&mut self.db_cache.word_prefix_docids,
|
|
|
|
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
|
2023-04-11 21:26:47 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
2023-02-21 16:43:23 +08:00
|
|
|
}
|
|
|
|
|
2023-04-12 04:04:38 +08:00
|
|
|
fn get_db_exact_word_prefix_docids(
|
|
|
|
&mut self,
|
|
|
|
prefix: Interned<String>,
|
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
|
|
|
prefix,
|
|
|
|
self.word_interner.get(prefix).as_str(),
|
|
|
|
&mut self.db_cache.exact_word_prefix_docids,
|
|
|
|
self.index.exact_word_prefix_docids.remap_data_type::<ByteSlice>(),
|
|
|
|
)?
|
|
|
|
.map(|bytes| RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
|
|
|
}
|
|
|
|
|
2023-03-30 17:08:17 +08:00
|
|
|
pub fn get_db_word_pair_proximity_docids(
|
2023-02-21 16:43:23 +08:00
|
|
|
&mut self,
|
2023-03-07 02:21:55 +08:00
|
|
|
word1: Interned<String>,
|
|
|
|
word2: Interned<String>,
|
2023-02-21 16:43:23 +08:00
|
|
|
proximity: u8,
|
2023-04-11 21:26:47 +08:00
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
2023-03-30 17:08:17 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
2023-03-08 20:26:29 +08:00
|
|
|
(proximity, word1, word2),
|
2023-03-30 17:08:17 +08:00
|
|
|
&(
|
|
|
|
proximity,
|
|
|
|
self.word_interner.get(word1).as_str(),
|
|
|
|
self.word_interner.get(word2).as_str(),
|
|
|
|
),
|
|
|
|
&mut self.db_cache.word_pair_proximity_docids,
|
|
|
|
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
2023-04-11 21:26:47 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get_db_word_pair_proximity_docids_len(
|
|
|
|
&mut self,
|
|
|
|
word1: Interned<String>,
|
|
|
|
word2: Interned<String>,
|
|
|
|
proximity: u8,
|
|
|
|
) -> Result<Option<u64>> {
|
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
|
|
|
(proximity, word1, word2),
|
|
|
|
&(
|
|
|
|
proximity,
|
|
|
|
self.word_interner.get(word1).as_str(),
|
|
|
|
self.word_interner.get(word2).as_str(),
|
|
|
|
),
|
|
|
|
&mut self.db_cache.word_pair_proximity_docids,
|
|
|
|
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
|
|
|
)?
|
|
|
|
.map(|bytes| {
|
|
|
|
CboRoaringBitmapLenCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())
|
|
|
|
})
|
|
|
|
.transpose()
|
2023-02-21 16:43:23 +08:00
|
|
|
}
|
|
|
|
|
2023-03-30 17:08:17 +08:00
|
|
|
pub fn get_db_word_prefix_pair_proximity_docids(
|
2023-02-21 16:43:23 +08:00
|
|
|
&mut self,
|
2023-03-07 02:21:55 +08:00
|
|
|
word1: Interned<String>,
|
|
|
|
prefix2: Interned<String>,
|
2023-02-21 16:43:23 +08:00
|
|
|
proximity: u8,
|
2023-04-11 21:26:47 +08:00
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
2023-03-30 17:08:17 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
2023-03-08 20:26:29 +08:00
|
|
|
(proximity, word1, prefix2),
|
2023-03-30 17:08:17 +08:00
|
|
|
&(
|
|
|
|
proximity,
|
|
|
|
self.word_interner.get(word1).as_str(),
|
|
|
|
self.word_interner.get(prefix2).as_str(),
|
|
|
|
),
|
|
|
|
&mut self.db_cache.word_prefix_pair_proximity_docids,
|
|
|
|
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
2023-04-11 21:26:47 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
2023-02-21 16:43:23 +08:00
|
|
|
}
|
2023-03-30 17:08:17 +08:00
|
|
|
pub fn get_db_prefix_word_pair_proximity_docids(
|
2023-02-21 22:10:22 +08:00
|
|
|
&mut self,
|
2023-03-07 02:21:55 +08:00
|
|
|
left_prefix: Interned<String>,
|
|
|
|
right: Interned<String>,
|
2023-02-21 22:10:22 +08:00
|
|
|
proximity: u8,
|
2023-04-11 21:26:47 +08:00
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
2023-03-30 17:08:17 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
2023-03-08 20:26:29 +08:00
|
|
|
(proximity, left_prefix, right),
|
|
|
|
&(
|
|
|
|
proximity,
|
2023-03-30 17:08:17 +08:00
|
|
|
self.word_interner.get(left_prefix).as_str(),
|
|
|
|
self.word_interner.get(right).as_str(),
|
2023-03-08 20:26:29 +08:00
|
|
|
),
|
2023-03-30 17:08:17 +08:00
|
|
|
&mut self.db_cache.prefix_word_pair_proximity_docids,
|
|
|
|
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
|
2023-04-11 21:26:47 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
2023-02-21 22:10:22 +08:00
|
|
|
}
|
2023-04-04 23:04:46 +08:00
|
|
|
|
|
|
|
pub fn get_db_word_position_docids(
|
|
|
|
&mut self,
|
|
|
|
word: Interned<String>,
|
|
|
|
position: u16,
|
2023-04-11 21:26:47 +08:00
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
2023-04-04 23:04:46 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
|
|
|
(word, position),
|
|
|
|
&(self.word_interner.get(word).as_str(), position),
|
|
|
|
&mut self.db_cache.word_position_docids,
|
|
|
|
self.index.word_position_docids.remap_data_type::<ByteSlice>(),
|
2023-04-11 21:26:47 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
2023-04-04 23:04:46 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get_db_word_fid_docids(
|
|
|
|
&mut self,
|
|
|
|
word: Interned<String>,
|
|
|
|
fid: u16,
|
2023-04-11 21:26:47 +08:00
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
2023-04-04 23:04:46 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
|
|
|
(word, fid),
|
|
|
|
&(self.word_interner.get(word).as_str(), fid),
|
|
|
|
&mut self.db_cache.word_fid_docids,
|
|
|
|
self.index.word_fid_docids.remap_data_type::<ByteSlice>(),
|
2023-04-11 21:26:47 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
2023-04-04 23:04:46 +08:00
|
|
|
}
|
2023-04-12 17:40:44 +08:00
|
|
|
|
|
|
|
pub fn get_db_word_prefix_fid_docids(
|
|
|
|
&mut self,
|
|
|
|
word_prefix: Interned<String>,
|
|
|
|
fid: u16,
|
2023-04-12 22:53:11 +08:00
|
|
|
) -> Result<Option<RoaringBitmap>> {
|
2023-04-12 17:40:44 +08:00
|
|
|
DatabaseCache::get_value(
|
|
|
|
self.txn,
|
|
|
|
(word_prefix, fid),
|
|
|
|
&(self.word_interner.get(word_prefix).as_str(), fid),
|
|
|
|
&mut self.db_cache.word_prefix_fid_docids,
|
|
|
|
self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(),
|
2023-04-12 22:53:11 +08:00
|
|
|
)?
|
|
|
|
.map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into()))
|
|
|
|
.transpose()
|
2023-04-12 17:40:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
|
|
|
|
let fids = match self.db_cache.word_fids.entry(word) {
|
|
|
|
Entry::Occupied(fids) => fids.get().clone(),
|
|
|
|
Entry::Vacant(entry) => {
|
|
|
|
let key = self.word_interner.get(word).as_bytes();
|
|
|
|
let mut fids = vec![];
|
|
|
|
let remap_key_type = self
|
|
|
|
.index
|
|
|
|
.word_fid_docids
|
|
|
|
.remap_types::<ByteSlice, ByteSlice>()
|
|
|
|
.prefix_iter(self.txn, key)?
|
|
|
|
.remap_key_type::<StrBEU16Codec>();
|
|
|
|
for result in remap_key_type {
|
|
|
|
let ((_, fid), value) = result?;
|
|
|
|
// filling other caches to avoid searching for them again
|
|
|
|
self.db_cache.word_fid_docids.insert((word, fid), Some(value));
|
|
|
|
fids.push(fid);
|
|
|
|
}
|
|
|
|
entry.insert(fids.clone());
|
|
|
|
fids
|
|
|
|
}
|
|
|
|
};
|
|
|
|
Ok(fids)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned<String>) -> Result<Vec<u16>> {
|
|
|
|
let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) {
|
|
|
|
Entry::Occupied(fids) => fids.get().clone(),
|
|
|
|
Entry::Vacant(entry) => {
|
|
|
|
let key = self.word_interner.get(word_prefix).as_bytes();
|
|
|
|
let mut fids = vec![];
|
|
|
|
let remap_key_type = self
|
|
|
|
.index
|
|
|
|
.word_prefix_fid_docids
|
|
|
|
.remap_types::<ByteSlice, ByteSlice>()
|
|
|
|
.prefix_iter(self.txn, key)?
|
|
|
|
.remap_key_type::<StrBEU16Codec>();
|
|
|
|
for result in remap_key_type {
|
|
|
|
let ((_, fid), value) = result?;
|
|
|
|
// filling other caches to avoid searching for them again
|
|
|
|
self.db_cache.word_prefix_fid_docids.insert((word_prefix, fid), Some(value));
|
|
|
|
fids.push(fid);
|
|
|
|
}
|
|
|
|
entry.insert(fids.clone());
|
|
|
|
fids
|
|
|
|
}
|
|
|
|
};
|
|
|
|
Ok(fids)
|
|
|
|
}
|
2023-02-21 16:43:23 +08:00
|
|
|
}
|