Update resolve_query_graph module to work with lazy query terms

This commit is contained in:
Loïc Lecrenier 2023-03-30 11:10:38 +02:00
parent d0f048c068
commit b96a682f16
3 changed files with 61 additions and 124 deletions

View File

@ -28,12 +28,13 @@ pub use logger::{DefaultSearchLogger, SearchLogger};
use query_graph::{QueryGraph, QueryNode, QueryNodeData}; use query_graph::{QueryGraph, QueryNode, QueryNodeData};
use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; use resolve_query_graph::PhraseDocIdsCache;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use words::Words; use words::Words;
use self::interner::Interner; use self::interner::Interner;
use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::ranking_rules::{BoxRankingRule, RankingRule};
use self::resolve_query_graph::compute_query_graph_docids;
use self::sort::Sort; use self::sort::Sort;
use crate::{ use crate::{
AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy,
@ -48,8 +49,7 @@ pub struct SearchContext<'ctx> {
pub word_interner: DedupInterner<String>, pub word_interner: DedupInterner<String>,
pub phrase_interner: DedupInterner<Phrase>, pub phrase_interner: DedupInterner<Phrase>,
pub term_interner: Interner<QueryTerm>, pub term_interner: Interner<QueryTerm>,
// think about memory usage of that field (roaring bitmaps in a hashmap) pub phrase_docids: PhraseDocIdsCache,
pub term_docids: QueryTermDocIdsCache,
} }
impl<'ctx> SearchContext<'ctx> { impl<'ctx> SearchContext<'ctx> {
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
@ -60,7 +60,7 @@ impl<'ctx> SearchContext<'ctx> {
word_interner: <_>::default(), word_interner: <_>::default(),
phrase_interner: <_>::default(), phrase_interner: <_>::default(),
term_interner: <_>::default(), term_interner: <_>::default(),
term_docids: <_>::default(), phrase_docids: <_>::default(),
} }
} }
} }
@ -103,7 +103,7 @@ fn resolve_maximally_reduced_query_graph(
} }
} }
logger.query_for_universe(&graph); logger.query_for_universe(&graph);
let docids = resolve_query_graph(ctx, &graph, universe)?; let docids = compute_query_graph_docids(ctx, &graph, universe)?;
Ok(docids) Ok(docids)
} }
@ -319,7 +319,7 @@ pub fn execute_search(
let tokens = tokenizer.tokenize(query); let tokens = tokenizer.tokenize(query);
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
let graph = QueryGraph::from_query(ctx, query_terms)?; let graph = QueryGraph::from_query(ctx, &query_terms)?;
check_sort_criteria(ctx, sort_criteria.as_ref())?; check_sort_criteria(ctx, sort_criteria.as_ref())?;

View File

@ -3,106 +3,63 @@
use std::collections::VecDeque; use std::collections::VecDeque;
use fxhash::FxHashMap; use fxhash::FxHashMap;
use heed::{BytesDecode, RoTxn}; use heed::BytesDecode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache; use super::interner::Interned;
use super::interner::{DedupInterner, Interned};
use super::query_graph::QueryNodeData; use super::query_graph::QueryNodeData;
use super::query_term::{Phrase, QueryTerm}; use super::query_term::{Phrase, QueryTermSubset};
use super::small_bitmap::SmallBitmap; use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, SearchContext}; use super::{QueryGraph, SearchContext};
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
#[derive(Default)] #[derive(Default)]
pub struct QueryTermDocIdsCache { pub struct PhraseDocIdsCache {
pub phrases: FxHashMap<Interned<Phrase>, RoaringBitmap>, pub cache: FxHashMap<Interned<Phrase>, RoaringBitmap>,
pub terms: FxHashMap<Interned<QueryTerm>, RoaringBitmap>,
} }
impl QueryTermDocIdsCache { impl<'ctx> SearchContext<'ctx> {
/// Get the document ids associated with the given phrase /// Get the document ids associated with the given phrase
pub fn get_phrase_docids<'s, 'ctx>( pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> {
&'s mut self, if self.phrase_docids.cache.contains_key(&phrase) {
index: &Index, return Ok(&self.phrase_docids.cache[&phrase]);
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &DedupInterner<String>,
phrase_interner: &DedupInterner<Phrase>,
phrase: Interned<Phrase>,
) -> Result<&'s RoaringBitmap> {
if self.phrases.contains_key(&phrase) {
return Ok(&self.phrases[&phrase]);
}; };
let docids = resolve_phrase(index, txn, db_cache, word_interner, phrase_interner, phrase)?; let docids = compute_phrase_docids(self, phrase)?;
let _ = self.phrases.insert(phrase, docids); let _ = self.phrase_docids.cache.insert(phrase, docids);
let docids = &self.phrases[&phrase]; let docids = &self.phrase_docids.cache[&phrase];
Ok(docids) Ok(docids)
} }
/// Get the document ids associated with the given term }
pub fn get_query_term_docids<'s, 'ctx>( pub fn compute_query_term_subset_docids(
&'s mut self, ctx: &mut SearchContext,
index: &Index, term: &QueryTermSubset,
txn: &'ctx RoTxn, ) -> Result<RoaringBitmap> {
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &DedupInterner<String>,
term_interner: &DedupInterner<QueryTerm>,
phrase_interner: &DedupInterner<Phrase>,
term_interned: Interned<QueryTerm>,
) -> Result<&'s RoaringBitmap> {
if self.terms.contains_key(&term_interned) {
return Ok(&self.terms[&term_interned]);
};
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
// TODO: use a MultiOps? for word in term.all_single_words_except_prefix_db(ctx)? {
let term = term_interner.get(term_interned); if let Some(word_docids) = ctx.get_db_word_docids(word)? {
for word in term.all_single_words_except_prefix_db() { docids |= RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?;
if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? {
docids |=
RoaringBitmapCodec::bytes_decode(word_docids).ok_or(heed::Error::Decoding)?;
} }
} }
for phrase in term.all_phrases() { for phrase in term.all_phrases(ctx)? {
docids |= self.get_phrase_docids( docids |= ctx.get_phrase_docids(phrase)?;
index,
txn,
db_cache,
word_interner,
phrase_interner,
phrase,
)?;
} }
if let Some(prefix) = term.use_prefix_db { if let Some(prefix) = term.use_prefix_db(ctx) {
if let Some(prefix_docids) = if let Some(prefix_docids) = ctx.get_db_word_prefix_docids(prefix)? {
db_cache.get_word_prefix_docids(index, txn, word_interner, prefix)?
{
docids |= docids |=
RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?; RoaringBitmapCodec::bytes_decode(prefix_docids).ok_or(heed::Error::Decoding)?;
} }
} }
let _ = self.terms.insert(term_interned, docids);
let docids = &self.terms[&term_interned];
Ok(docids) Ok(docids)
}
} }
pub fn resolve_query_graph( pub fn compute_query_graph_docids(
ctx: &mut SearchContext, ctx: &mut SearchContext,
q: &QueryGraph, q: &QueryGraph,
universe: &RoaringBitmap, universe: &RoaringBitmap,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let SearchContext { // TODO: there must be a faster way to compute this big
index,
txn,
db_cache,
word_interner,
phrase_interner,
term_interner,
term_docids: query_term_docids,
..
} = ctx;
// TODO: there is a faster way to compute this big
// roaring bitmap expression // roaring bitmap expression
let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes); let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
@ -125,17 +82,13 @@ pub fn resolve_query_graph(
} }
let node_docids = match &node.data { let node_docids = match &node.data {
QueryNodeData::Term(located_term) => { QueryNodeData::Term(LocatedQueryTermSubset {
let term_docids = query_term_docids.get_query_term_docids( term_subset,
index, positions: _,
txn, term_ids: _,
db_cache, }) => {
word_interner, let phrase_docids = compute_query_term_subset_docids(ctx, term_subset)?;
term_interner, predecessors_docids & phrase_docids
phrase_interner,
located_term.value,
)?;
predecessors_docids & term_docids
} }
QueryNodeData::Deleted => { QueryNodeData::Deleted => {
panic!() panic!()
@ -163,15 +116,11 @@ pub fn resolve_query_graph(
panic!() panic!()
} }
pub fn resolve_phrase<'ctx>( pub fn compute_phrase_docids(
index: &Index, ctx: &mut SearchContext,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &DedupInterner<String>,
phrase_interner: &DedupInterner<Phrase>,
phrase: Interned<Phrase>, phrase: Interned<Phrase>,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let Phrase { words } = phrase_interner.get(phrase).clone(); let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let mut first_iter = true; let mut first_iter = true;
let winsize = words.len().min(3); let winsize = words.len().min(3);
@ -195,14 +144,7 @@ pub fn resolve_phrase<'ctx>(
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
{ {
if dist == 0 { if dist == 0 {
match db_cache.get_word_pair_proximity_docids( match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? {
index,
txn,
word_interner,
s1,
s2,
1,
)? {
Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?), Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
// If there are no documents for this pair, there will be no // If there are no documents for this pair, there will be no
// results for the phrase query. // results for the phrase query.
@ -211,14 +153,9 @@ pub fn resolve_phrase<'ctx>(
} else { } else {
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
for dist in 0..=dist { for dist in 0..=dist {
if let Some(m) = db_cache.get_word_pair_proximity_docids( if let Some(m) =
index, ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
txn, {
word_interner,
s1,
s2,
dist as u8 + 1,
)? {
bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?; bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
} }
} }

View File

@ -4,7 +4,7 @@ use roaring::RoaringBitmap;
use super::logger::SearchLogger; use super::logger::SearchLogger;
use super::query_graph::QueryNodeData; use super::query_graph::QueryNodeData;
use super::resolve_query_graph::resolve_query_graph; use super::resolve_query_graph::compute_query_graph_docids;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::{Result, TermsMatchingStrategy}; use crate::{Result, TermsMatchingStrategy};
@ -80,7 +80,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
logger.log_words_state(query_graph); logger.log_words_state(query_graph);
let this_bucket = resolve_query_graph(ctx, query_graph, universe)?; let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?;
let child_query_graph = query_graph.clone(); let child_query_graph = query_graph.clone();
loop { loop {