2023-03-09 18:12:31 +08:00
|
|
|
#![allow(clippy::too_many_arguments)]
|
|
|
|
|
2023-03-08 16:55:53 +08:00
|
|
|
use std::collections::VecDeque;
|
|
|
|
|
|
|
|
use fxhash::FxHashMap;
|
2023-03-09 18:12:31 +08:00
|
|
|
use heed::{BytesDecode, RoTxn};
|
2023-03-08 16:55:53 +08:00
|
|
|
use roaring::{MultiOps, RoaringBitmap};
|
|
|
|
|
2023-03-09 18:12:31 +08:00
|
|
|
use super::db_cache::DatabaseCache;
|
|
|
|
use super::interner::{Interned, Interner};
|
|
|
|
use super::query_graph::QUERY_GRAPH_NODE_LENGTH_LIMIT;
|
2023-03-07 02:21:55 +08:00
|
|
|
use super::query_term::{Phrase, QueryTerm, WordDerivations};
|
2023-03-07 21:42:58 +08:00
|
|
|
use super::small_bitmap::SmallBitmap;
|
2023-03-07 02:21:55 +08:00
|
|
|
use super::{QueryGraph, QueryNode, SearchContext};
|
2023-03-09 18:12:31 +08:00
|
|
|
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
|
2023-02-21 16:45:17 +08:00
|
|
|
|
|
|
|
#[derive(Default)]
|
2023-03-09 18:12:31 +08:00
|
|
|
pub struct QueryTermDocIdsCache {
|
|
|
|
pub phrases: FxHashMap<Interned<Phrase>, RoaringBitmap>,
|
|
|
|
pub derivations: FxHashMap<Interned<WordDerivations>, RoaringBitmap>,
|
2023-02-21 20:21:41 +08:00
|
|
|
}
|
2023-03-09 18:12:31 +08:00
|
|
|
impl QueryTermDocIdsCache {
|
|
|
|
/// Get the document ids associated with the given phrase
|
2023-03-13 21:03:48 +08:00
|
|
|
pub fn get_phrase_docids<'s, 'ctx>(
|
2023-03-09 18:12:31 +08:00
|
|
|
&'s mut self,
|
|
|
|
index: &Index,
|
2023-03-13 21:03:48 +08:00
|
|
|
txn: &'ctx RoTxn,
|
|
|
|
db_cache: &mut DatabaseCache<'ctx>,
|
2023-03-09 18:12:31 +08:00
|
|
|
word_interner: &Interner<String>,
|
|
|
|
phrase_interner: &Interner<Phrase>,
|
|
|
|
phrase: Interned<Phrase>,
|
|
|
|
) -> Result<&'s RoaringBitmap> {
|
|
|
|
if self.phrases.contains_key(&phrase) {
|
|
|
|
return Ok(&self.phrases[&phrase]);
|
2023-02-21 20:21:41 +08:00
|
|
|
};
|
2023-03-09 18:12:31 +08:00
|
|
|
let docids = resolve_phrase(index, txn, db_cache, word_interner, phrase_interner, phrase)?;
|
|
|
|
let _ = self.phrases.insert(phrase, docids);
|
|
|
|
let docids = &self.phrases[&phrase];
|
|
|
|
Ok(docids)
|
|
|
|
}
|
2023-03-03 04:27:57 +08:00
|
|
|
|
2023-03-09 18:12:31 +08:00
|
|
|
/// Get the document ids associated with the given word derivations
|
2023-03-13 21:03:48 +08:00
|
|
|
pub fn get_word_derivations_docids<'s, 'ctx>(
|
2023-03-09 18:12:31 +08:00
|
|
|
&'s mut self,
|
|
|
|
index: &Index,
|
2023-03-13 21:03:48 +08:00
|
|
|
txn: &'ctx RoTxn,
|
|
|
|
db_cache: &mut DatabaseCache<'ctx>,
|
2023-03-09 18:12:31 +08:00
|
|
|
word_interner: &Interner<String>,
|
|
|
|
derivations_interner: &Interner<WordDerivations>,
|
|
|
|
phrase_interner: &Interner<Phrase>,
|
|
|
|
derivations: Interned<WordDerivations>,
|
|
|
|
) -> Result<&'s RoaringBitmap> {
|
|
|
|
if self.derivations.contains_key(&derivations) {
|
|
|
|
return Ok(&self.derivations[&derivations]);
|
2023-02-21 20:21:41 +08:00
|
|
|
};
|
2023-03-09 18:12:31 +08:00
|
|
|
let WordDerivations {
|
2023-03-14 00:21:29 +08:00
|
|
|
original: _,
|
|
|
|
is_prefix: _,
|
|
|
|
zero_typo,
|
|
|
|
prefix_of,
|
2023-03-09 18:12:31 +08:00
|
|
|
synonyms,
|
|
|
|
split_words,
|
|
|
|
one_typo,
|
|
|
|
two_typos,
|
|
|
|
use_prefix_db,
|
|
|
|
} = derivations_interner.get(derivations);
|
|
|
|
let mut or_docids = vec![];
|
2023-03-14 00:21:29 +08:00
|
|
|
for word in zero_typo
|
|
|
|
.iter()
|
|
|
|
.chain(prefix_of.iter())
|
|
|
|
.chain(one_typo.iter())
|
|
|
|
.chain(two_typos.iter())
|
|
|
|
.copied()
|
|
|
|
{
|
2023-03-09 18:12:31 +08:00
|
|
|
if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? {
|
|
|
|
or_docids.push(word_docids);
|
|
|
|
}
|
|
|
|
}
|
2023-03-14 00:21:29 +08:00
|
|
|
if let Some(prefix) = use_prefix_db {
|
2023-03-09 18:12:31 +08:00
|
|
|
if let Some(prefix_docids) =
|
2023-03-14 00:21:29 +08:00
|
|
|
db_cache.get_word_prefix_docids(index, txn, word_interner, *prefix)?
|
2023-03-09 18:12:31 +08:00
|
|
|
{
|
|
|
|
or_docids.push(prefix_docids);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
let mut docids = or_docids
|
|
|
|
.into_iter()
|
|
|
|
.map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap())
|
|
|
|
.collect::<Vec<_>>();
|
|
|
|
for synonym in synonyms.iter().copied() {
|
|
|
|
// TODO: cache resolve_phrase?
|
|
|
|
docids.push(resolve_phrase(
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
db_cache,
|
|
|
|
word_interner,
|
|
|
|
phrase_interner,
|
|
|
|
synonym,
|
|
|
|
)?);
|
|
|
|
}
|
|
|
|
if let Some(split_words) = split_words {
|
|
|
|
docids.push(resolve_phrase(
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
db_cache,
|
|
|
|
word_interner,
|
|
|
|
phrase_interner,
|
|
|
|
*split_words,
|
|
|
|
)?);
|
|
|
|
}
|
|
|
|
|
|
|
|
let docids = MultiOps::union(docids);
|
|
|
|
let _ = self.derivations.insert(derivations, docids);
|
|
|
|
let docids = &self.derivations[&derivations];
|
2023-02-21 20:21:41 +08:00
|
|
|
Ok(docids)
|
|
|
|
}
|
2023-03-09 18:12:31 +08:00
|
|
|
|
|
|
|
/// Get the document ids associated with the given query term.
|
2023-03-13 21:03:48 +08:00
|
|
|
fn get_query_term_docids<'s, 'ctx>(
|
2023-03-09 18:12:31 +08:00
|
|
|
&'s mut self,
|
|
|
|
index: &Index,
|
2023-03-13 21:03:48 +08:00
|
|
|
txn: &'ctx RoTxn,
|
|
|
|
db_cache: &mut DatabaseCache<'ctx>,
|
2023-03-09 18:12:31 +08:00
|
|
|
word_interner: &Interner<String>,
|
|
|
|
derivations_interner: &Interner<WordDerivations>,
|
|
|
|
phrase_interner: &Interner<Phrase>,
|
|
|
|
term: &QueryTerm,
|
|
|
|
) -> Result<&'s RoaringBitmap> {
|
|
|
|
match *term {
|
|
|
|
QueryTerm::Phrase { phrase } => {
|
|
|
|
self.get_phrase_docids(index, txn, db_cache, word_interner, phrase_interner, phrase)
|
|
|
|
}
|
|
|
|
QueryTerm::Word { derivations } => self.get_word_derivations_docids(
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
db_cache,
|
|
|
|
word_interner,
|
|
|
|
derivations_interner,
|
|
|
|
phrase_interner,
|
|
|
|
derivations,
|
|
|
|
),
|
|
|
|
}
|
|
|
|
}
|
2023-02-21 16:45:17 +08:00
|
|
|
}
|
|
|
|
|
2023-03-13 21:03:48 +08:00
|
|
|
pub fn resolve_query_graph<'ctx>(
|
|
|
|
ctx: &mut SearchContext<'ctx>,
|
2023-02-21 16:45:17 +08:00
|
|
|
q: &QueryGraph,
|
|
|
|
universe: &RoaringBitmap,
|
|
|
|
) -> Result<RoaringBitmap> {
|
2023-03-09 18:12:31 +08:00
|
|
|
let SearchContext {
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
db_cache,
|
|
|
|
word_interner,
|
|
|
|
phrase_interner,
|
|
|
|
derivations_interner,
|
|
|
|
query_term_docids,
|
2023-03-09 22:53:59 +08:00
|
|
|
..
|
2023-03-09 18:12:31 +08:00
|
|
|
} = ctx;
|
|
|
|
// TODO: there is a faster way to compute this big
|
2023-02-21 16:45:17 +08:00
|
|
|
// roaring bitmap expression
|
|
|
|
|
2023-03-09 18:12:31 +08:00
|
|
|
let mut nodes_resolved = SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT);
|
2023-02-21 20:21:41 +08:00
|
|
|
let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];
|
2023-02-21 16:45:17 +08:00
|
|
|
|
|
|
|
let mut next_nodes_to_visit = VecDeque::new();
|
2023-03-09 18:12:31 +08:00
|
|
|
next_nodes_to_visit.push_back(q.root_node);
|
2023-02-21 16:45:17 +08:00
|
|
|
|
|
|
|
while let Some(node) = next_nodes_to_visit.pop_front() {
|
2023-02-21 19:55:44 +08:00
|
|
|
let predecessors = &q.edges[node as usize].predecessors;
|
2023-02-21 16:45:17 +08:00
|
|
|
if !predecessors.is_subset(&nodes_resolved) {
|
|
|
|
next_nodes_to_visit.push_back(node);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Take union of all predecessors
|
2023-03-07 21:42:58 +08:00
|
|
|
let mut predecessors_docids = RoaringBitmap::new();
|
|
|
|
for p in predecessors.iter() {
|
|
|
|
predecessors_docids |= &path_nodes_docids[p as usize];
|
|
|
|
}
|
2023-02-21 16:45:17 +08:00
|
|
|
|
2023-02-21 19:55:44 +08:00
|
|
|
let n = &q.nodes[node as usize];
|
2023-03-03 04:27:57 +08:00
|
|
|
|
2023-02-21 16:45:17 +08:00
|
|
|
let node_docids = match n {
|
2023-03-03 04:27:57 +08:00
|
|
|
QueryNode::Term(located_term) => {
|
2023-03-09 18:12:31 +08:00
|
|
|
let derivations_docids = query_term_docids.get_query_term_docids(
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
db_cache,
|
|
|
|
word_interner,
|
|
|
|
derivations_interner,
|
|
|
|
phrase_interner,
|
|
|
|
&located_term.value,
|
|
|
|
)?;
|
2023-02-21 20:21:41 +08:00
|
|
|
predecessors_docids & derivations_docids
|
2023-02-21 16:45:17 +08:00
|
|
|
}
|
2023-03-03 04:27:57 +08:00
|
|
|
QueryNode::Deleted => {
|
2023-02-21 20:21:41 +08:00
|
|
|
panic!()
|
2023-02-21 16:45:17 +08:00
|
|
|
}
|
2023-03-03 04:27:57 +08:00
|
|
|
QueryNode::Start => universe.clone(),
|
|
|
|
QueryNode::End => {
|
2023-02-21 16:45:17 +08:00
|
|
|
return Ok(predecessors_docids);
|
|
|
|
}
|
|
|
|
};
|
2023-02-21 19:55:44 +08:00
|
|
|
nodes_resolved.insert(node);
|
2023-02-21 20:21:41 +08:00
|
|
|
path_nodes_docids[node as usize] = node_docids;
|
2023-02-21 16:45:17 +08:00
|
|
|
|
2023-02-21 19:55:44 +08:00
|
|
|
for succ in q.edges[node as usize].successors.iter() {
|
|
|
|
if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) {
|
|
|
|
next_nodes_to_visit.push_back(succ);
|
2023-02-21 16:45:17 +08:00
|
|
|
}
|
|
|
|
}
|
2023-02-21 20:57:34 +08:00
|
|
|
|
2023-02-21 19:55:44 +08:00
|
|
|
for prec in q.edges[node as usize].predecessors.iter() {
|
2023-02-21 19:33:32 +08:00
|
|
|
if q.edges[prec as usize].successors.is_subset(&nodes_resolved) {
|
2023-02-21 20:21:41 +08:00
|
|
|
path_nodes_docids[prec as usize].clear();
|
2023-02-21 16:45:17 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
panic!()
|
|
|
|
}
|
2023-03-03 04:27:57 +08:00
|
|
|
|
2023-03-13 21:03:48 +08:00
|
|
|
pub fn resolve_phrase<'ctx>(
|
2023-03-09 18:12:31 +08:00
|
|
|
index: &Index,
|
2023-03-13 21:03:48 +08:00
|
|
|
txn: &'ctx RoTxn,
|
|
|
|
db_cache: &mut DatabaseCache<'ctx>,
|
2023-03-09 18:12:31 +08:00
|
|
|
word_interner: &Interner<String>,
|
|
|
|
phrase_interner: &Interner<Phrase>,
|
|
|
|
phrase: Interned<Phrase>,
|
|
|
|
) -> Result<RoaringBitmap> {
|
|
|
|
let Phrase { words } = phrase_interner.get(phrase).clone();
|
2023-03-03 04:27:57 +08:00
|
|
|
let mut candidates = RoaringBitmap::new();
|
|
|
|
let mut first_iter = true;
|
|
|
|
let winsize = words.len().min(3);
|
|
|
|
|
|
|
|
if words.is_empty() {
|
|
|
|
return Ok(candidates);
|
|
|
|
}
|
|
|
|
|
|
|
|
for win in words.windows(winsize) {
|
|
|
|
// Get all the documents with the matching distance for each word pairs.
|
|
|
|
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
2023-03-07 02:21:55 +08:00
|
|
|
for (offset, &s1) in win
|
2023-03-03 04:27:57 +08:00
|
|
|
.iter()
|
|
|
|
.enumerate()
|
|
|
|
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
|
|
|
{
|
2023-03-07 02:21:55 +08:00
|
|
|
for (dist, &s2) in win
|
2023-03-03 04:27:57 +08:00
|
|
|
.iter()
|
|
|
|
.skip(offset + 1)
|
|
|
|
.enumerate()
|
|
|
|
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
|
|
|
{
|
|
|
|
if dist == 0 {
|
2023-03-09 18:12:31 +08:00
|
|
|
match db_cache.get_word_pair_proximity_docids(
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
word_interner,
|
|
|
|
s1,
|
|
|
|
s2,
|
|
|
|
1,
|
|
|
|
)? {
|
2023-03-03 04:27:57 +08:00
|
|
|
Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
|
|
|
|
// If there are no documents for this pair, there will be no
|
|
|
|
// results for the phrase query.
|
|
|
|
None => return Ok(RoaringBitmap::new()),
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
let mut bitmap = RoaringBitmap::new();
|
|
|
|
for dist in 0..=dist {
|
2023-03-09 18:12:31 +08:00
|
|
|
if let Some(m) = db_cache.get_word_pair_proximity_docids(
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
word_interner,
|
|
|
|
s1,
|
|
|
|
s2,
|
|
|
|
dist as u8 + 1,
|
|
|
|
)? {
|
2023-03-03 04:27:57 +08:00
|
|
|
bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if bitmap.is_empty() {
|
|
|
|
return Ok(bitmap);
|
|
|
|
} else {
|
|
|
|
bitmaps.push(bitmap);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We sort the bitmaps so that we perform the small intersections first, which is faster.
|
|
|
|
bitmaps.sort_unstable_by_key(|a| a.len());
|
|
|
|
|
|
|
|
for bitmap in bitmaps {
|
|
|
|
if first_iter {
|
|
|
|
candidates = bitmap;
|
|
|
|
first_iter = false;
|
|
|
|
} else {
|
|
|
|
candidates &= bitmap;
|
|
|
|
}
|
|
|
|
// There will be no match, return early
|
|
|
|
if candidates.is_empty() {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(candidates)
|
|
|
|
}
|