mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 10:37:41 +08:00
Add exactness ranking rules
This commit is contained in:
parent
1b8e4d0301
commit
8a13ed7e3f
175
milli/src/search/new/exact_attribute.rs
Normal file
175
milli/src/search/new/exact_attribute.rs
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
use heed::BytesDecode;
|
||||||
|
use roaring::MultiOps;
|
||||||
|
|
||||||
|
use super::query_graph::QueryGraph;
|
||||||
|
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
||||||
|
use crate::search::new::query_graph::QueryNodeData;
|
||||||
|
use crate::search::new::query_term::ExactTerm;
|
||||||
|
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
|
||||||
|
|
||||||
|
/// FIXME:
|
||||||
|
///
|
||||||
|
/// - A lot of work done in next_bucket that start_iteration could do.
|
||||||
|
/// - Consider calling the graph based rule directly from this one.
|
||||||
|
/// - currently we did exact term, don't forget about prefix
|
||||||
|
/// - some tests
|
||||||
|
pub struct ExactAttribute {
|
||||||
|
query_graph: Option<QueryGraph>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ExactAttribute {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self { query_graph: None }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
||||||
|
fn id(&self) -> String {
|
||||||
|
"exact_attribute".to_owned()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn start_iteration(
|
||||||
|
&mut self,
|
||||||
|
_ctx: &mut SearchContext<'ctx>,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
_universe: &roaring::RoaringBitmap,
|
||||||
|
query: &QueryGraph,
|
||||||
|
) -> Result<()> {
|
||||||
|
self.query_graph = Some(query.clone());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next_bucket(
|
||||||
|
&mut self,
|
||||||
|
ctx: &mut SearchContext<'ctx>,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
universe: &roaring::RoaringBitmap,
|
||||||
|
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||||
|
// iterate on the nodes of the graph, retain LocatedQueryTermSubset
|
||||||
|
let query_graph = self.query_graph.as_ref().unwrap();
|
||||||
|
let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
|
||||||
|
Vec::with_capacity(query_graph.nodes.len() as usize);
|
||||||
|
for (_, node) in query_graph.nodes.iter() {
|
||||||
|
match &node.data {
|
||||||
|
QueryNodeData::Term(term) => {
|
||||||
|
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
|
||||||
|
exact_term
|
||||||
|
} else {
|
||||||
|
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
||||||
|
return Ok(Some(RankingRuleOutput {
|
||||||
|
query: query_graph.clone(),
|
||||||
|
candidates: universe.clone(),
|
||||||
|
}));
|
||||||
|
};
|
||||||
|
exact_term_position_ids.push((
|
||||||
|
exact_term,
|
||||||
|
*term.positions.start(),
|
||||||
|
*term.term_ids.start(),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
|
||||||
|
// bail if there is a "hole" (missing word) in remaining query graph
|
||||||
|
let mut previous_id = 0;
|
||||||
|
for (_, _, id) in exact_term_position_ids.iter().copied() {
|
||||||
|
if id < previous_id || id - previous_id > 1 {
|
||||||
|
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
||||||
|
return Ok(Some(RankingRuleOutput {
|
||||||
|
query: query_graph.clone(),
|
||||||
|
candidates: universe.clone(),
|
||||||
|
}));
|
||||||
|
} else {
|
||||||
|
previous_id = id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sample query: "sunflower are pretty"
|
||||||
|
// sunflower at pos 0 in attr A
|
||||||
|
// are at pos 1 in attr B
|
||||||
|
// pretty at pos 2 in attr C
|
||||||
|
// We want to eliminate such document
|
||||||
|
|
||||||
|
// first check that for each term, there exists some attribute that has this term at the correct position
|
||||||
|
//"word-position-docids";
|
||||||
|
let mut candidates = universe.clone();
|
||||||
|
let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids
|
||||||
|
.iter()
|
||||||
|
.copied()
|
||||||
|
.map(|(term, position, _)| (term.interned_words(ctx).collect(), position))
|
||||||
|
.collect();
|
||||||
|
for (words, position) in &words_positions {
|
||||||
|
if candidates.is_empty() {
|
||||||
|
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
||||||
|
return Ok(Some(RankingRuleOutput {
|
||||||
|
query: query_graph.clone(),
|
||||||
|
candidates: universe.clone(),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
'words: for (offset, word) in words.iter().enumerate() {
|
||||||
|
let offset = offset as u16;
|
||||||
|
let word = if let Some(word) = word {
|
||||||
|
word
|
||||||
|
} else {
|
||||||
|
continue 'words;
|
||||||
|
};
|
||||||
|
let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
|
||||||
|
ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(),
|
||||||
|
)
|
||||||
|
.unwrap_or_default();
|
||||||
|
candidates &= word_position_docids;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let candidates = candidates;
|
||||||
|
|
||||||
|
if candidates.is_empty() {
|
||||||
|
// FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
|
||||||
|
return Ok(Some(RankingRuleOutput {
|
||||||
|
query: query_graph.clone(),
|
||||||
|
candidates: universe.clone(),
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
|
||||||
|
|
||||||
|
let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len());
|
||||||
|
|
||||||
|
// then check that there exists at least one attribute that has all of the terms
|
||||||
|
for fid in searchable_fields_ids {
|
||||||
|
let mut intersection = MultiOps::intersection(
|
||||||
|
words_positions
|
||||||
|
.iter()
|
||||||
|
.flat_map(|(words, ..)| words.iter())
|
||||||
|
// ignore stop words words in phrases
|
||||||
|
.flatten()
|
||||||
|
.map(|word| -> Result<_> {
|
||||||
|
Ok(ctx
|
||||||
|
.get_db_word_fid_docids(*word, fid)?
|
||||||
|
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||||
|
.unwrap_or_default()
|
||||||
|
.unwrap_or_default())
|
||||||
|
}),
|
||||||
|
)?;
|
||||||
|
intersection &= &candidates;
|
||||||
|
if !intersection.is_empty() {
|
||||||
|
candidates_per_attributes.push(intersection);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// note we could have "false positives" where there both exist different attributes that collectively
|
||||||
|
// have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
|
||||||
|
|
||||||
|
let candidates = MultiOps::union(candidates_per_attributes.into_iter());
|
||||||
|
Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates }))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn end_iteration(
|
||||||
|
&mut self,
|
||||||
|
_ctx: &mut SearchContext<'ctx>,
|
||||||
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
) {
|
||||||
|
}
|
||||||
|
}
|
@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner};
|
|||||||
use super::logger::SearchLogger;
|
use super::logger::SearchLogger;
|
||||||
use super::query_graph::QueryNode;
|
use super::query_graph::QueryNode;
|
||||||
use super::ranking_rule_graph::{
|
use super::ranking_rule_graph::{
|
||||||
ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
|
ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph,
|
||||||
TypoGraph,
|
RankingRuleGraphTrait, TypoGraph,
|
||||||
};
|
};
|
||||||
use super::small_bitmap::SmallBitmap;
|
use super::small_bitmap::SmallBitmap;
|
||||||
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
||||||
@ -65,6 +65,12 @@ impl GraphBasedRankingRule<TypoGraph> {
|
|||||||
Self::new_with_id("typo".to_owned(), terms_matching_strategy)
|
Self::new_with_id("typo".to_owned(), terms_matching_strategy)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
|
||||||
|
impl GraphBasedRankingRule<ExactnessGraph> {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self::new_with_id("exactness".to_owned(), None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A generic graph-based ranking rule
|
/// A generic graph-based ranking rule
|
||||||
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
||||||
|
@ -9,8 +9,9 @@ mod query_term;
|
|||||||
mod ranking_rule_graph;
|
mod ranking_rule_graph;
|
||||||
mod ranking_rules;
|
mod ranking_rules;
|
||||||
mod resolve_query_graph;
|
mod resolve_query_graph;
|
||||||
// TODO: documentation + comments
|
|
||||||
mod small_bitmap;
|
mod small_bitmap;
|
||||||
|
|
||||||
|
mod exact_attribute;
|
||||||
// TODO: documentation + comments
|
// TODO: documentation + comments
|
||||||
// implementation is currently an adaptation of the previous implementation to fit with the new model
|
// implementation is currently an adaptation of the previous implementation to fit with the new model
|
||||||
mod sort;
|
mod sort;
|
||||||
@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use words::Words;
|
use words::Words;
|
||||||
|
|
||||||
|
use self::exact_attribute::ExactAttribute;
|
||||||
|
use self::graph_based_ranking_rule::Exactness;
|
||||||
use self::interner::Interner;
|
use self::interner::Interner;
|
||||||
use self::ranking_rules::{BoxRankingRule, RankingRule};
|
use self::ranking_rules::{BoxRankingRule, RankingRule};
|
||||||
use self::resolve_query_graph::compute_query_graph_docids;
|
use self::resolve_query_graph::compute_query_graph_docids;
|
||||||
@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
|
|||||||
let mut proximity = false;
|
let mut proximity = false;
|
||||||
let mut sort = false;
|
let mut sort = false;
|
||||||
let attribute = false;
|
let attribute = false;
|
||||||
let exactness = false;
|
let mut exactness = false;
|
||||||
let mut asc = HashSet::new();
|
let mut asc = HashSet::new();
|
||||||
let mut desc = HashSet::new();
|
let mut desc = HashSet::new();
|
||||||
|
|
||||||
@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
|
|||||||
if exactness {
|
if exactness {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// todo!();
|
ranking_rules.push(Box::new(ExactAttribute::new()));
|
||||||
// exactness = false;
|
ranking_rules.push(Box::new(Exactness::new()));
|
||||||
|
exactness = true;
|
||||||
}
|
}
|
||||||
crate::Criterion::Asc(field_name) => {
|
crate::Criterion::Asc(field_name) => {
|
||||||
if asc.contains(&field_name) {
|
if asc.contains(&field_name) {
|
||||||
|
107
milli/src/search/new/ranking_rule_graph/exactness/mod.rs
Normal file
107
milli/src/search/new/ranking_rule_graph/exactness/mod.rs
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
|
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
||||||
|
use crate::search::new::query_graph::{QueryGraph, QueryNode};
|
||||||
|
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
|
||||||
|
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
|
||||||
|
|
||||||
|
/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other
|
||||||
|
/// word than a doc that matches 9 words non exactly but none exactly
|
||||||
|
/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider
|
||||||
|
///
|
||||||
|
/// "naive vision"
|
||||||
|
/// condition from one node to another:
|
||||||
|
/// - word exactly present: cost 0
|
||||||
|
/// - word typo/ngram/prefix/missing: cost 1, not removed from the query graph, edge between the two nodes, return the universe without condition when resolving, destination query term is inside
|
||||||
|
///
|
||||||
|
/// Three strategies:
|
||||||
|
/// 1. ExactAttribute: word position / word_fid_docid
|
||||||
|
/// 2. AttributeStart:
|
||||||
|
/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait`
|
||||||
|
|
||||||
|
///
/// Condition attached to an edge of the exactness graph. Both variants carry the
/// term subset of the edge's destination node.
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum ExactnessCondition {
    /// The destination term must be present in its exact form: resolved through
    /// `compute_docids`. Edges with this condition are built with a cost of 0.
    ExactInAttribute(LocatedQueryTermSubset),
    /// No constraint on the destination term: resolved to the whole universe.
    /// Edges with this condition are built with a cost of 1.
    Skip(LocatedQueryTermSubset),
}
|
||||||
|
|
||||||
|
/// Marker type for the exactness ranking rule graph.
///
/// The enum has no variant, so it can never be instantiated; it only exists to carry
/// the `RankingRuleGraphTrait` implementation.
pub enum ExactnessGraph {}
|
||||||
|
|
||||||
|
fn compute_docids(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
dest_node: &LocatedQueryTermSubset,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
) -> Result<RoaringBitmap> {
|
||||||
|
let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
|
||||||
|
exact_term
|
||||||
|
} else {
|
||||||
|
return Ok(Default::default());
|
||||||
|
};
|
||||||
|
let mut candidates = match exact_term {
|
||||||
|
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
|
||||||
|
ExactTerm::Word(word) => {
|
||||||
|
if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
|
||||||
|
CboRoaringBitmapCodec::deserialize_from(word_candidates)?
|
||||||
|
} else {
|
||||||
|
return Ok(Default::default());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// TODO: synonyms?
|
||||||
|
candidates &= universe;
|
||||||
|
Ok(candidates)
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RankingRuleGraphTrait for ExactnessGraph {
|
||||||
|
type Condition = ExactnessCondition;
|
||||||
|
|
||||||
|
fn resolve_condition(
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
condition: &Self::Condition,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
) -> Result<ComputedCondition> {
|
||||||
|
let (docids, dest_node) = match condition {
|
||||||
|
ExactnessCondition::ExactInAttribute(dest_node) => {
|
||||||
|
(compute_docids(ctx, dest_node, universe)?, dest_node)
|
||||||
|
}
|
||||||
|
ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node),
|
||||||
|
};
|
||||||
|
Ok(ComputedCondition {
|
||||||
|
docids,
|
||||||
|
universe_len: universe.len(),
|
||||||
|
start_term_subset: None,
|
||||||
|
end_term_subset: dest_node.clone(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_edges(
|
||||||
|
_ctx: &mut SearchContext,
|
||||||
|
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||||
|
_source_node: Option<&LocatedQueryTermSubset>,
|
||||||
|
dest_node: &LocatedQueryTermSubset,
|
||||||
|
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||||
|
let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone());
|
||||||
|
let exact_condition = conditions_interner.insert(exact_condition);
|
||||||
|
|
||||||
|
let skip_condition = ExactnessCondition::Skip(dest_node.clone());
|
||||||
|
let skip_condition = conditions_interner.insert(skip_condition);
|
||||||
|
Ok(vec![(0, exact_condition), (1, skip_condition)])
|
||||||
|
}
|
||||||
|
|
||||||
|
fn log_state(
|
||||||
|
graph: &RankingRuleGraph<Self>,
|
||||||
|
paths: &[Vec<Interned<Self::Condition>>],
|
||||||
|
dead_ends_cache: &DeadEndsCache<Self::Condition>,
|
||||||
|
universe: &RoaringBitmap,
|
||||||
|
costs: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
|
cost: u64,
|
||||||
|
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
|
) {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
|
||||||
|
todo!()
|
||||||
|
}
|
||||||
|
}
|
@ -10,6 +10,8 @@ mod cheapest_paths;
|
|||||||
mod condition_docids_cache;
|
mod condition_docids_cache;
|
||||||
mod dead_ends_cache;
|
mod dead_ends_cache;
|
||||||
|
|
||||||
|
/// Implementation of the `exactness` ranking rule
|
||||||
|
mod exactness;
|
||||||
/// Implementation of the `proximity` ranking rule
|
/// Implementation of the `proximity` ranking rule
|
||||||
mod proximity;
|
mod proximity;
|
||||||
/// Implementation of the `typo` ranking rule
|
/// Implementation of the `typo` ranking rule
|
||||||
@ -20,6 +22,7 @@ use std::hash::Hash;
|
|||||||
pub use cheapest_paths::PathVisitor;
|
pub use cheapest_paths::PathVisitor;
|
||||||
pub use condition_docids_cache::ConditionDocIdsCache;
|
pub use condition_docids_cache::ConditionDocIdsCache;
|
||||||
pub use dead_ends_cache::DeadEndsCache;
|
pub use dead_ends_cache::DeadEndsCache;
|
||||||
|
pub use exactness::{ExactnessCondition, ExactnessGraph};
|
||||||
pub use proximity::{ProximityCondition, ProximityGraph};
|
pub use proximity::{ProximityCondition, ProximityGraph};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
pub use typo::{TypoCondition, TypoGraph};
|
pub use typo::{TypoCondition, TypoGraph};
|
||||||
|
Loading…
Reference in New Issue
Block a user