//! Resolution of query graphs, query term subsets, and phrases into
//! sets of document ids (`RoaringBitmap`s).
#![allow(clippy::too_many_arguments)]
use std::collections::VecDeque;

use fxhash::FxHashMap;
use roaring::RoaringBitmap;

use super::interner::Interned;
use super::query_graph::QueryNodeData;
use super::query_term::{Phrase, QueryTermSubset};
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, SearchContext, Word};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::Result;
/// Cache mapping each interned phrase to the ids of the documents
/// containing it, so a phrase's docids are only computed once per search.
#[derive(Default)]
pub struct PhraseDocIdsCache {
    // Keyed by the interned phrase; uses FxHashMap since keys are small interned ids.
    pub cache: FxHashMap<Interned<Phrase>, RoaringBitmap>,
}
impl<'ctx> SearchContext<'ctx> {
2023-03-09 18:12:31 +08:00
/// Get the document ids associated with the given phrase
pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> {
if self.phrase_docids.cache.contains_key(&phrase) {
return Ok(&self.phrase_docids.cache[&phrase]);
};
let docids = compute_phrase_docids(self, phrase)?;
let _ = self.phrase_docids.cache.insert(phrase, docids);
let docids = &self.phrase_docids.cache[&phrase];
2023-03-09 18:12:31 +08:00
Ok(docids)
}
}
/// Compute the ids of all documents matching the given query term subset.
///
/// The result is the union of the docids of the subset's single words, its
/// phrases, and — when the subset uses the prefix database — its prefix.
pub fn compute_query_term_subset_docids(
    ctx: &mut SearchContext,
    term: &QueryTermSubset,
) -> Result<RoaringBitmap> {
    let mut candidates = RoaringBitmap::new();

    // Union over every single word not handled by the prefix database.
    for word in term.all_single_words_except_prefix_db(ctx)? {
        if let Some(word_docids) = ctx.word_docids(word)? {
            candidates |= word_docids;
        }
    }

    // Union over every phrase of the term subset.
    for phrase in term.all_phrases(ctx)? {
        candidates |= ctx.get_phrase_docids(phrase)?;
    }

    // Union over the prefix database entry, when the term uses one.
    if let Some(prefix) = term.use_prefix_db(ctx) {
        if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? {
            candidates |= prefix_docids;
        }
    }

    Ok(candidates)
}
pub fn compute_query_graph_docids(
2023-03-23 16:15:57 +08:00
ctx: &mut SearchContext,
q: &QueryGraph,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
// TODO: there must be a faster way to compute this big
// roaring bitmap expression
2023-03-14 23:37:47 +08:00
let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new());
let mut next_nodes_to_visit = VecDeque::new();
2023-03-09 18:12:31 +08:00
next_nodes_to_visit.push_back(q.root_node);
2023-03-14 23:37:47 +08:00
while let Some(node_id) = next_nodes_to_visit.pop_front() {
let node = q.nodes.get(node_id);
let predecessors = &node.predecessors;
if !predecessors.is_subset(&nodes_resolved) {
2023-03-14 23:37:47 +08:00
next_nodes_to_visit.push_back(node_id);
continue;
}
// Take union of all predecessors
let mut predecessors_docids = RoaringBitmap::new();
for p in predecessors.iter() {
2023-03-14 23:37:47 +08:00
predecessors_docids |= path_nodes_docids.get(p);
}
2023-03-14 23:37:47 +08:00
let node_docids = match &node.data {
QueryNodeData::Term(LocatedQueryTermSubset {
term_subset,
positions: _,
term_ids: _,
}) => {
let phrase_docids = compute_query_term_subset_docids(ctx, term_subset)?;
predecessors_docids & phrase_docids
}
2023-03-14 23:37:47 +08:00
QueryNodeData::Deleted => {
panic!()
}
2023-03-14 23:37:47 +08:00
QueryNodeData::Start => universe.clone(),
QueryNodeData::End => {
return Ok(predecessors_docids);
}
};
2023-03-14 23:37:47 +08:00
nodes_resolved.insert(node_id);
*path_nodes_docids.get_mut(node_id) = node_docids;
2023-03-14 23:37:47 +08:00
for succ in node.successors.iter() {
if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) {
next_nodes_to_visit.push_back(succ);
}
}
2023-02-21 20:57:34 +08:00
2023-03-14 23:37:47 +08:00
for prec in node.predecessors.iter() {
if q.nodes.get(prec).successors.is_subset(&nodes_resolved) {
path_nodes_docids.get_mut(prec).clear();
}
}
}
panic!()
}
pub fn compute_phrase_docids(
ctx: &mut SearchContext,
2023-03-09 18:12:31 +08:00
phrase: Interned<Phrase>,
) -> Result<RoaringBitmap> {
let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
if words.is_empty() {
return Ok(RoaringBitmap::new());
}
if words.len() == 1 {
if let Some(word) = &words[0] {
if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? {
2023-04-11 21:31:40 +08:00
return Ok(word_docids);
} else {
return Ok(RoaringBitmap::new());
}
} else {
return Ok(RoaringBitmap::new());
}
}
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = words.len().min(3);
for win in words.windows(winsize) {
// Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, &s1) in win
.iter()
.enumerate()
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
{
for (dist, &s2) in win
.iter()
.skip(offset + 1)
.enumerate()
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
{
if dist == 0 {
match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
2023-04-11 21:31:40 +08:00
Some(m) => bitmaps.push(m),
// If there are no documents for this pair, there will be no
// results for the phrase query.
None => return Ok(RoaringBitmap::new()),
}
} else {
let mut bitmap = RoaringBitmap::new();
for dist in 0..=dist {
if let Some(m) =
ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
{
2023-04-11 21:31:40 +08:00
bitmap |= m;
}
}
if bitmap.is_empty() {
return Ok(bitmap);
} else {
bitmaps.push(bitmap);
}
}
}
}
// We sort the bitmaps so that we perform the small intersections first, which is faster.
bitmaps.sort_unstable_by_key(|a| a.len());
for bitmap in bitmaps {
if first_iter {
candidates = bitmap;
first_iter = false;
} else {
candidates &= bitmap;
}
// There will be no match, return early
if candidates.is_empty() {
break;
}
}
}
Ok(candidates)
}