2023-03-06 19:21:55 +01:00
|
|
|
use super::{interner::Interned, SearchContext};
|
|
|
|
use crate::Result;
|
2023-02-21 12:55:44 +01:00
|
|
|
use fxhash::FxHashMap;
|
2023-02-21 13:57:34 +01:00
|
|
|
use heed::types::ByteSlice;
|
2023-03-06 19:21:55 +01:00
|
|
|
use std::collections::hash_map::Entry;
|
2023-02-21 09:43:23 +01:00
|
|
|
|
|
|
|
#[derive(Default)]
|
2023-03-06 19:21:55 +01:00
|
|
|
pub struct DatabaseCache<'search> {
|
2023-03-08 09:53:05 +01:00
|
|
|
// TODO: interner for all database cache keys?
|
2023-03-06 19:21:55 +01:00
|
|
|
pub word_pair_proximity_docids:
|
|
|
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
|
2023-02-21 09:43:23 +01:00
|
|
|
pub word_prefix_pair_proximity_docids:
|
2023-03-06 19:21:55 +01:00
|
|
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
|
2023-02-21 15:10:22 +01:00
|
|
|
pub prefix_word_pair_proximity_docids:
|
2023-03-06 19:21:55 +01:00
|
|
|
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
|
|
|
|
pub word_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
|
|
|
|
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
|
|
|
|
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
|
2023-02-21 09:43:23 +01:00
|
|
|
}
|
2023-03-06 19:21:55 +01:00
|
|
|
impl<'search> SearchContext<'search> {
|
|
|
|
pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
|
|
|
|
let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
|
2023-02-21 09:43:23 +01:00
|
|
|
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
|
|
|
|
Entry::Vacant(entry) => {
|
2023-03-06 19:21:55 +01:00
|
|
|
let bitmap_ptr = self
|
|
|
|
.index
|
|
|
|
.word_docids
|
|
|
|
.remap_data_type::<ByteSlice>()
|
|
|
|
.get(self.txn, self.word_interner.get(word))?;
|
2023-02-21 09:43:23 +01:00
|
|
|
entry.insert(bitmap_ptr);
|
|
|
|
bitmap_ptr
|
|
|
|
}
|
|
|
|
};
|
|
|
|
Ok(bitmap_ptr)
|
|
|
|
}
|
2023-03-06 19:21:55 +01:00
|
|
|
pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
|
2023-02-21 09:43:23 +01:00
|
|
|
// In the future, this will be a frozen roaring bitmap
|
2023-03-06 19:21:55 +01:00
|
|
|
let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
|
2023-02-21 09:43:23 +01:00
|
|
|
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
|
|
|
|
Entry::Vacant(entry) => {
|
2023-03-06 19:21:55 +01:00
|
|
|
let bitmap_ptr = self
|
|
|
|
.index
|
|
|
|
.word_prefix_docids
|
|
|
|
.remap_data_type::<ByteSlice>()
|
|
|
|
.get(self.txn, self.word_interner.get(prefix))?;
|
2023-02-21 09:43:23 +01:00
|
|
|
entry.insert(bitmap_ptr);
|
|
|
|
bitmap_ptr
|
|
|
|
}
|
|
|
|
};
|
|
|
|
Ok(bitmap_ptr)
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get_word_pair_proximity_docids(
|
|
|
|
&mut self,
|
2023-03-06 19:21:55 +01:00
|
|
|
word1: Interned<String>,
|
|
|
|
word2: Interned<String>,
|
2023-02-21 09:43:23 +01:00
|
|
|
proximity: u8,
|
2023-03-06 19:21:55 +01:00
|
|
|
) -> Result<Option<&'search [u8]>> {
|
|
|
|
let key = (proximity, word1, word2);
|
|
|
|
match self.db_cache.word_pair_proximity_docids.entry(key) {
|
2023-02-21 09:43:23 +01:00
|
|
|
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
|
|
|
Entry::Vacant(entry) => {
|
2023-02-21 13:57:34 +01:00
|
|
|
// We shouldn't greedily access this DB at all
|
2023-02-21 09:43:23 +01:00
|
|
|
// a DB (w1, w2) -> [proximities] would be much better
|
|
|
|
// We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
|
|
|
|
// And if we worked with words encoded as integers, the set of words could be a roaring bitmap
|
|
|
|
// Then, to find all the proximities between two list of words, we'd do:
|
|
|
|
|
|
|
|
// inputs:
|
|
|
|
// - words1 (roaring bitmap)
|
|
|
|
// - words2 (roaring bitmap)
|
|
|
|
// output:
|
|
|
|
// - [(word1, word2, [proximities])]
|
|
|
|
// algo:
|
|
|
|
// let mut ouput = vec![];
|
|
|
|
// for word1 in words1 {
|
|
|
|
// let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
|
|
|
|
// let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
|
|
|
|
// for word2 in words_in_proximity_of_w1 {
|
|
|
|
// let proximties = prox_db.get(word1, word2);
|
|
|
|
// output.push(word1, word2, proximities);
|
|
|
|
// }
|
|
|
|
// }
|
2023-03-06 19:21:55 +01:00
|
|
|
let bitmap_ptr =
|
|
|
|
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
|
|
|
|
self.txn,
|
|
|
|
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
|
|
|
|
)?;
|
2023-02-21 09:43:23 +01:00
|
|
|
entry.insert(bitmap_ptr);
|
|
|
|
Ok(bitmap_ptr)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn get_word_prefix_pair_proximity_docids(
|
|
|
|
&mut self,
|
2023-03-06 19:21:55 +01:00
|
|
|
word1: Interned<String>,
|
|
|
|
prefix2: Interned<String>,
|
2023-02-21 09:43:23 +01:00
|
|
|
proximity: u8,
|
2023-03-06 19:21:55 +01:00
|
|
|
) -> Result<Option<&'search [u8]>> {
|
|
|
|
let key = (proximity, word1, prefix2);
|
|
|
|
match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
|
2023-02-21 09:43:23 +01:00
|
|
|
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
|
|
|
Entry::Vacant(entry) => {
|
2023-03-06 19:21:55 +01:00
|
|
|
let bitmap_ptr = self
|
|
|
|
.index
|
2023-02-21 09:43:23 +01:00
|
|
|
.word_prefix_pair_proximity_docids
|
|
|
|
.remap_data_type::<ByteSlice>()
|
2023-03-06 19:21:55 +01:00
|
|
|
.get(
|
|
|
|
self.txn,
|
|
|
|
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
|
|
|
|
)?;
|
2023-02-21 09:43:23 +01:00
|
|
|
entry.insert(bitmap_ptr);
|
|
|
|
Ok(bitmap_ptr)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-02-21 15:10:22 +01:00
|
|
|
pub fn get_prefix_word_pair_proximity_docids(
|
|
|
|
&mut self,
|
2023-03-06 19:21:55 +01:00
|
|
|
left_prefix: Interned<String>,
|
|
|
|
right: Interned<String>,
|
2023-02-21 15:10:22 +01:00
|
|
|
proximity: u8,
|
2023-03-06 19:21:55 +01:00
|
|
|
) -> Result<Option<&'search [u8]>> {
|
|
|
|
let key = (proximity, left_prefix, right);
|
|
|
|
match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
|
2023-02-21 15:10:22 +01:00
|
|
|
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
|
|
|
Entry::Vacant(entry) => {
|
2023-03-06 19:21:55 +01:00
|
|
|
let bitmap_ptr = self
|
|
|
|
.index
|
2023-02-21 15:10:22 +01:00
|
|
|
.prefix_word_pair_proximity_docids
|
|
|
|
.remap_data_type::<ByteSlice>()
|
2023-03-06 19:21:55 +01:00
|
|
|
.get(
|
|
|
|
self.txn,
|
|
|
|
&(
|
|
|
|
proximity,
|
|
|
|
self.word_interner.get(left_prefix),
|
|
|
|
self.word_interner.get(right),
|
|
|
|
),
|
|
|
|
)?;
|
2023-02-21 15:10:22 +01:00
|
|
|
entry.insert(bitmap_ptr);
|
|
|
|
Ok(bitmap_ptr)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-02-21 09:43:23 +01:00
|
|
|
}
|