Exact attribute with state

This commit is contained in:
Louis Dureuil 2023-04-05 14:42:51 +02:00 committed by Louis Dureuil
parent 8a13ed7e3f
commit f513cf930a

View File

@ -1,5 +1,5 @@
use heed::BytesDecode; use heed::BytesDecode;
use roaring::MultiOps; use roaring::{MultiOps, RoaringBitmap};
use super::query_graph::QueryGraph; use super::query_graph::QueryGraph;
use super::ranking_rules::{RankingRule, RankingRuleOutput}; use super::ranking_rules::{RankingRule, RankingRuleOutput};
@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm; use crate::search::new::query_term::ExactTerm;
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
/// FIXME: /// A ranking rule that produces 3 disjoint buckets:
/// ///
/// - A lot of work done in next_bucket that start_iteration could do. /// 1. Documents from the universe whose value is exactly the query.
/// - Consider calling the graph based rule directly from this one. /// 2. Documents from the universe not in (1) whose value starts with the query.
/// - currently we did exact term, don't forget about prefix /// 3. Documents from the universe not in (1) or (2).
/// - some tests
pub struct ExactAttribute { pub struct ExactAttribute {
query_graph: Option<QueryGraph>, state: State,
} }
impl ExactAttribute { impl ExactAttribute {
pub fn new() -> Self { pub fn new() -> Self {
Self { query_graph: None } Self { state: Default::default() }
} }
} }
@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
fn start_iteration( fn start_iteration(
&mut self, &mut self,
_ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>, _logger: &mut dyn SearchLogger<QueryGraph>,
_universe: &roaring::RoaringBitmap, universe: &roaring::RoaringBitmap,
query: &QueryGraph, query: &QueryGraph,
) -> Result<()> { ) -> Result<()> {
self.query_graph = Some(query.clone()); self.state = State::start_iteration(ctx, universe, query)?;
Ok(()) Ok(())
} }
fn next_bucket( fn next_bucket(
&mut self, &mut self,
ctx: &mut SearchContext<'ctx>, _ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>, _logger: &mut dyn SearchLogger<QueryGraph>,
universe: &roaring::RoaringBitmap, universe: &roaring::RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> { ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
// iterate on the nodes of the graph, retain LocatedQueryTermSubset let state = std::mem::take(&mut self.state);
let query_graph = self.query_graph.as_ref().unwrap(); let (state, output) = State::next(state, universe);
self.state = state;
Ok(output)
}
fn end_iteration(
&mut self,
_ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.state = Default::default();
}
}
/// Inner state of the ranking rule.
///
/// `State::next` consumes the current state and returns its successor:
/// `ExactAttribute` -> `AttributeStarts` -> `Empty`, and `Empty` then repeats on every later call.
/// `State::start_iteration` yields `ExactAttribute`, or `Empty` when it bails out early.
#[derive(Default)]
enum State {
/// State between two iterations.
///
/// This is the default value; `State::next` outputs `None` while in this state.
#[default]
Uninitialized,
/// The next call to `next` will output the documents in the universe that have an attribute that is the exact query.
///
/// Per field this is `start_with_exact & exact_word_count`; the per-field results are unioned.
ExactAttribute(QueryGraph, Vec<FieldCandidates>),
/// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
/// but isn't the exact query.
///
/// Per field this is `start_with_exact - exact_word_count`; the per-field results are unioned.
AttributeStarts(QueryGraph, Vec<FieldCandidates>),
/// The next calls to `next` will output the input universe.
///
/// `State::start_iteration` also returns this state directly when it bails out
/// (for instance on a hole in the query, or when no candidate is left).
Empty(QueryGraph),
}
/// The candidates of one field.
///
/// Every bitmap in a single `FieldCandidates` struct applies to the same field.
struct FieldCandidates {
/// The candidates that start with all the words of the query in the field
start_with_exact: RoaringBitmap,
/// The candidates that have the same number of words as the query in the field
///
/// Read as-is from `field_id_word_count_docids`; it is only combined with
/// `start_with_exact` later, in `State::next`.
exact_word_count: RoaringBitmap,
}
impl State {
fn start_iteration(
ctx: &mut SearchContext<'_>,
universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<Self> {
let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
Vec::with_capacity(query_graph.nodes.len() as usize); Vec::with_capacity(query_graph.nodes.len() as usize);
for (_, node) in query_graph.nodes.iter() { for (_, node) in query_graph.nodes.iter() {
@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
exact_term exact_term
} else { } else {
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules continue;
return Ok(Some(RankingRuleOutput {
query: query_graph.clone(),
candidates: universe.clone(),
}));
}; };
exact_term_position_ids.push(( exact_term_position_ids.push((
exact_term, exact_term,
@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
exact_term_position_ids.sort_by_key(|(_, _, id)| *id); exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
// bail if there is a "hole" (missing word) in remaining query graph // bail if there is a "hole" (missing word) in remaining query graph
if let Some((_, _, first_id)) = exact_term_position_ids.first() {
if *first_id != 0 {
return Ok(State::Empty(query_graph.clone()));
}
} else {
return Ok(State::Empty(query_graph.clone()));
}
let mut previous_id = 0; let mut previous_id = 0;
for (_, _, id) in exact_term_position_ids.iter().copied() { for (_, _, id) in exact_term_position_ids.iter().copied() {
if id < previous_id || id - previous_id > 1 { if id < previous_id || id - previous_id > 1 {
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules return Ok(State::Empty(query_graph.clone()));
return Ok(Some(RankingRuleOutput {
query: query_graph.clone(),
candidates: universe.clone(),
}));
} else { } else {
previous_id = id; previous_id = id;
} }
@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
.collect(); .collect();
for (words, position) in &words_positions { for (words, position) in &words_positions {
if candidates.is_empty() { if candidates.is_empty() {
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules return Ok(State::Empty(query_graph.clone()));
return Ok(Some(RankingRuleOutput {
query: query_graph.clone(),
candidates: universe.clone(),
}));
} }
'words: for (offset, word) in words.iter().enumerate() { 'words: for (offset, word) in words.iter().enumerate() {
@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
} else { } else {
continue 'words; continue 'words;
}; };
// Note: positions are stored bucketed in word_position_docids, so for queries that contain
// many long phrases this lookup loses precision.
let bucketed_position = crate::bucketed_position(position + offset);
let word_position_docids = CboRoaringBitmapCodec::bytes_decode( let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(),
) )
.unwrap_or_default(); .unwrap_or_default();
candidates &= word_position_docids; candidates &= word_position_docids;
@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
let candidates = candidates; let candidates = candidates;
if candidates.is_empty() { if candidates.is_empty() {
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules return Ok(State::Empty(query_graph.clone()));
return Ok(Some(RankingRuleOutput {
query: query_graph.clone(),
candidates: universe.clone(),
}));
} }
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
// then check that there exists at least one attribute that has all of the terms // then check that there exists at least one attribute that has all of the terms
for fid in searchable_fields_ids { for fid in searchable_fields_ids {
@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
)?; )?;
intersection &= &candidates; intersection &= &candidates;
if !intersection.is_empty() { if !intersection.is_empty() {
candidates_per_attributes.push(intersection); let candidates_with_exact_word_count = ctx
.index
.field_id_word_count_docids
.get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))?
.unwrap_or_default();
candidates_per_attribute.push(FieldCandidates {
start_with_exact: intersection,
exact_word_count: candidates_with_exact_word_count,
});
} }
} }
// note we could have "false positives" where there both exist different attributes that collectively // note we could have "false positives" where there both exist different attributes that collectively
// have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
let candidates = MultiOps::union(candidates_per_attributes.into_iter()); Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates }))
} }
fn end_iteration( fn next(
&mut self, state: State,
_ctx: &mut SearchContext<'ctx>, universe: &RoaringBitmap,
_logger: &mut dyn SearchLogger<QueryGraph>, ) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
) { let (state, output) = match state {
State::Uninitialized => (state, None),
State::ExactAttribute(query_graph, candidates_per_attribute) => {
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|FieldCandidates { start_with_exact, exact_word_count }| {
start_with_exact & exact_word_count
},
));
candidates &= universe;
(
State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
Some(RankingRuleOutput { query: query_graph, candidates }),
)
}
State::AttributeStarts(query_graph, candidates_per_attribute) => {
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|FieldCandidates { mut start_with_exact, exact_word_count }| {
start_with_exact -= exact_word_count;
start_with_exact
},
));
candidates &= universe;
(
State::Empty(query_graph.clone()),
Some(RankingRuleOutput { query: query_graph, candidates }),
)
}
State::Empty(query_graph) => (
State::Empty(query_graph.clone()),
Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }),
),
};
(state, output)
} }
} }