meilisearch/milli/src/search/new/exact_attribute.rs

290 lines
11 KiB
Rust
Raw Normal View History

2023-04-05 14:42:51 +02:00
use roaring::{MultiOps, RoaringBitmap};
2023-04-04 17:12:07 +02:00
use super::query_graph::QueryGraph;
use super::ranking_rules::{RankingRule, RankingRuleOutput};
2023-06-15 17:32:27 +02:00
use crate::score_details::{self, ScoreDetails};
2023-04-04 17:12:07 +02:00
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm;
2023-04-11 15:31:40 +02:00
use crate::{Result, SearchContext, SearchLogger};
2023-04-04 17:12:07 +02:00
2023-04-05 14:42:51 +02:00
/// A ranking rule that produces 3 disjoint buckets:
2023-04-04 17:12:07 +02:00
///
2023-04-05 14:42:51 +02:00
/// 1. Documents from the universe whose value is exactly the query.
/// 2. Documents from the universe not in (1) whose value starts with the query.
/// 3. Documents from the universe not in (1) or (2).
2023-04-04 17:12:07 +02:00
pub struct ExactAttribute {
2023-04-05 14:42:51 +02:00
state: State,
2023-04-04 17:12:07 +02:00
}
impl ExactAttribute {
pub fn new() -> Self {
2023-04-05 14:42:51 +02:00
Self { state: Default::default() }
2023-04-04 17:12:07 +02:00
}
}
impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
fn id(&self) -> String {
"exact_attribute".to_owned()
}
fn start_iteration(
&mut self,
2023-04-05 14:42:51 +02:00
ctx: &mut SearchContext<'ctx>,
2023-04-04 17:12:07 +02:00
_logger: &mut dyn SearchLogger<QueryGraph>,
2023-04-05 14:42:51 +02:00
universe: &roaring::RoaringBitmap,
2023-04-04 17:12:07 +02:00
query: &QueryGraph,
) -> Result<()> {
2023-04-05 14:42:51 +02:00
self.state = State::start_iteration(ctx, universe, query)?;
2023-04-04 17:12:07 +02:00
Ok(())
}
fn next_bucket(
&mut self,
2023-04-05 14:42:51 +02:00
_ctx: &mut SearchContext<'ctx>,
2023-04-04 17:12:07 +02:00
_logger: &mut dyn SearchLogger<QueryGraph>,
universe: &roaring::RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
2023-04-05 14:42:51 +02:00
let state = std::mem::take(&mut self.state);
let (state, output) = State::next(state, universe);
self.state = state;
Ok(output)
}
fn end_iteration(
&mut self,
_ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.state = Default::default();
}
}
/// Inner state of the ranking rule.
#[derive(Default)]
enum State {
/// State between two iterations
#[default]
Uninitialized,
/// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
ExactAttribute(QueryGraph, Vec<FieldCandidates>),
/// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
/// but isn't the exact query.
AttributeStarts(QueryGraph, Vec<FieldCandidates>),
/// The next calls to `next` will output the input universe.
Empty(QueryGraph),
}
/// The candidates sorted by attributes
///
/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field.
struct FieldCandidates {
/// The candidates that start with all the words of the query in the field
start_with_exact: RoaringBitmap,
/// The candidates that have the same number of words as the query in the field
exact_word_count: RoaringBitmap,
}
impl State {
fn start_iteration(
ctx: &mut SearchContext<'_>,
universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<Self> {
struct ExactTermInfo {
exact_term: ExactTerm,
start_position: u16,
start_term_id: u8,
position_count: usize,
}
let mut exact_terms: Vec<ExactTermInfo> =
2023-04-04 17:12:07 +02:00
Vec::with_capacity(query_graph.nodes.len() as usize);
for (_, node) in query_graph.nodes.iter() {
match &node.data {
QueryNodeData::Term(term) => {
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
exact_term
} else {
2023-04-05 14:42:51 +02:00
continue;
2023-04-04 17:12:07 +02:00
};
exact_terms.push(ExactTermInfo {
2023-04-04 17:12:07 +02:00
exact_term,
start_position: *term.positions.start(),
start_term_id: *term.term_ids.start(),
position_count: term.positions.len(),
});
2023-04-04 17:12:07 +02:00
}
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
}
}
exact_terms.sort_by_key(|x| x.start_term_id);
exact_terms.dedup_by_key(|x| x.start_term_id);
let count_all_positions = exact_terms.iter().fold(0, |acc, x| acc + x.position_count);
2023-04-04 17:12:07 +02:00
// bail if there is a "hole" (missing word) in remaining query graph
if let Some(e) = exact_terms.first() {
if e.start_term_id != 0 {
2023-04-05 14:42:51 +02:00
return Ok(State::Empty(query_graph.clone()));
}
} else {
return Ok(State::Empty(query_graph.clone()));
}
2023-04-04 17:12:07 +02:00
let mut previous_id = 0;
for e in exact_terms.iter() {
if e.start_term_id < previous_id || e.start_term_id - previous_id > 1 {
2023-04-05 14:42:51 +02:00
return Ok(State::Empty(query_graph.clone()));
2023-04-04 17:12:07 +02:00
} else {
previous_id = e.start_term_id;
2023-04-04 17:12:07 +02:00
}
}
// sample query: "sunflower are pretty"
// sunflower at pos 0 in attr A
// are at pos 1 in attr B
// pretty at pos 2 in attr C
// We want to eliminate such document
// first check that for each term, there exists some attribute that has this term at the correct position
//"word-position-docids";
let mut candidates = universe.clone();
let words_positions: Vec<(Vec<_>, _)> = exact_terms
2023-04-04 17:12:07 +02:00
.iter()
.map(|e| (e.exact_term.interned_words(ctx).collect(), e.start_position))
2023-04-04 17:12:07 +02:00
.collect();
for (words, position) in &words_positions {
if candidates.is_empty() {
2023-04-05 14:42:51 +02:00
return Ok(State::Empty(query_graph.clone()));
2023-04-04 17:12:07 +02:00
}
'words: for (offset, word) in words.iter().enumerate() {
let offset = offset as u16;
let word = if let Some(word) = word {
word
} else {
continue 'words;
};
2023-04-05 14:42:51 +02:00
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
// longer phrases we'll be losing on precision here.
let bucketed_position = crate::bucketed_position(position + offset);
2023-04-11 15:31:40 +02:00
let word_position_docids =
2023-05-01 17:20:10 +02:00
ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default()
& universe;
2023-04-04 17:12:07 +02:00
candidates &= word_position_docids;
if candidates.is_empty() {
return Ok(State::Empty(query_graph.clone()));
}
2023-04-04 17:12:07 +02:00
}
}
let candidates = candidates;
if candidates.is_empty() {
2023-04-05 14:42:51 +02:00
return Ok(State::Empty(query_graph.clone()));
2023-04-04 17:12:07 +02:00
}
2023-05-01 17:20:10 +02:00
let searchable_fields_ids = {
if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? {
fids
} else {
ctx.index.fields_ids_map(ctx.txn)?.ids().collect()
}
};
2023-04-04 17:12:07 +02:00
2023-04-05 14:42:51 +02:00
let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
2023-04-04 17:12:07 +02:00
// then check that there exists at least one attribute that has all of the terms
for fid in searchable_fields_ids {
let mut intersection = MultiOps::intersection(
words_positions
.iter()
.flat_map(|(words, ..)| words.iter())
// ignore stop words words in phrases
.flatten()
.map(|word| -> Result<_> {
2023-04-11 15:31:40 +02:00
Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default())
2023-04-04 17:12:07 +02:00
}),
)?;
intersection &= &candidates;
if !intersection.is_empty() {
2023-06-06 11:31:26 +02:00
// Although not really worth it in terms of performance,
// if would be good to put this in cache for the sake of consistency
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
ctx.index
.field_id_word_count_docids
.get(ctx.txn, &(fid, count_all_positions as u8))?
.unwrap_or_default()
2023-05-01 17:20:10 +02:00
& universe
} else {
RoaringBitmap::default()
};
2023-04-05 14:42:51 +02:00
candidates_per_attribute.push(FieldCandidates {
start_with_exact: intersection,
exact_word_count: candidates_with_exact_word_count,
});
2023-04-04 17:12:07 +02:00
}
}
// note we could have "false positives" where there both exist different attributes that collectively
// have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
2023-04-05 14:42:51 +02:00
Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
2023-04-04 17:12:07 +02:00
}
2023-04-05 14:42:51 +02:00
fn next(
state: State,
universe: &RoaringBitmap,
) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
let (state, output) = match state {
State::Uninitialized => (state, None),
State::ExactAttribute(query_graph, candidates_per_attribute) => {
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|FieldCandidates { start_with_exact, exact_word_count }| {
start_with_exact & exact_word_count
},
));
candidates &= universe;
(
State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
2023-06-15 17:32:27 +02:00
Some(RankingRuleOutput {
query: query_graph,
candidates,
score: ScoreDetails::ExactAttribute(
score_details::ExactAttribute::ExactMatch,
),
}),
2023-04-05 14:42:51 +02:00
)
}
State::AttributeStarts(query_graph, candidates_per_attribute) => {
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|FieldCandidates { mut start_with_exact, exact_word_count }| {
start_with_exact -= exact_word_count;
start_with_exact
},
));
candidates &= universe;
(
State::Empty(query_graph.clone()),
2023-06-15 17:32:27 +02:00
Some(RankingRuleOutput {
query: query_graph,
candidates,
score: ScoreDetails::ExactAttribute(
score_details::ExactAttribute::MatchesStart,
),
}),
2023-04-05 14:42:51 +02:00
)
}
State::Empty(query_graph) => (
State::Empty(query_graph.clone()),
2023-06-15 17:32:27 +02:00
Some(RankingRuleOutput {
query: query_graph,
candidates: universe.clone(),
score: ScoreDetails::ExactAttribute(
score_details::ExactAttribute::NoExactMatch,
),
}),
2023-04-05 14:42:51 +02:00
),
};
(state, output)
2023-04-04 17:12:07 +02:00
}
}