mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-12-02 01:55:03 +08:00
Score for graph based ranking rules
Count phrases in matchingWords and maxMatchingWords
This commit is contained in:
parent
8939e85f60
commit
c621a250a7
@ -50,6 +50,7 @@ use super::ranking_rule_graph::{
|
|||||||
};
|
};
|
||||||
use super::small_bitmap::SmallBitmap;
|
use super::small_bitmap::SmallBitmap;
|
||||||
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
||||||
|
use crate::score_details::Rank;
|
||||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||||
use crate::search::new::ranking_rule_graph::PathVisitor;
|
use crate::search::new::ranking_rule_graph::PathVisitor;
|
||||||
use crate::{Result, TermsMatchingStrategy};
|
use crate::{Result, TermsMatchingStrategy};
|
||||||
@ -118,6 +119,8 @@ pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
|||||||
all_costs: MappedInterner<QueryNode, Vec<u64>>,
|
all_costs: MappedInterner<QueryNode, Vec<u64>>,
|
||||||
/// An index in the first element of `all_distances`, giving the cost of the next bucket
|
/// An index in the first element of `all_distances`, giving the cost of the next bucket
|
||||||
cur_cost: u64,
|
cur_cost: u64,
|
||||||
|
/// One above the highest possible cost for this rule
|
||||||
|
next_max_cost: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule<G> {
|
impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule<G> {
|
||||||
@ -131,7 +134,20 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
|
|||||||
_universe: &RoaringBitmap,
|
_universe: &RoaringBitmap,
|
||||||
query_graph: &QueryGraph,
|
query_graph: &QueryGraph,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
// the `next_max_cost` is the successor integer to the maximum cost of the paths in the graph.
|
||||||
|
//
|
||||||
|
// When there is a matching strategy, it also factors the additional costs of:
|
||||||
|
// 1. The words that are matched in phrases
|
||||||
|
// 2. Skipping words (by adding them to the paths with a cost)
|
||||||
|
let mut next_max_cost = 1;
|
||||||
let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy {
|
let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy {
|
||||||
|
// add the cost of the phrase to the next_max_cost
|
||||||
|
next_max_cost += query_graph
|
||||||
|
.words_in_phrases_count(ctx)
|
||||||
|
// remove 1 from the words in phrases count, because when there is a phrase we can now have a document
|
||||||
|
// where only the phrase is matching, and none of the non-phrase words.
|
||||||
|
// With the `1` that `next_max_cost` is initialized with, this gets counted twice.
|
||||||
|
.saturating_sub(1) as u64;
|
||||||
match terms_matching_strategy {
|
match terms_matching_strategy {
|
||||||
TermsMatchingStrategy::Last => {
|
TermsMatchingStrategy::Last => {
|
||||||
let removal_order =
|
let removal_order =
|
||||||
@ -161,12 +177,16 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
|
|||||||
// Then pre-compute the cost of all paths from each node to the end node
|
// Then pre-compute the cost of all paths from each node to the end node
|
||||||
let all_costs = graph.find_all_costs_to_end();
|
let all_costs = graph.find_all_costs_to_end();
|
||||||
|
|
||||||
|
next_max_cost +=
|
||||||
|
all_costs.get(graph.query_graph.root_node).iter().copied().max().unwrap_or(0);
|
||||||
|
|
||||||
let state = GraphBasedRankingRuleState {
|
let state = GraphBasedRankingRuleState {
|
||||||
graph,
|
graph,
|
||||||
conditions_cache: condition_docids_cache,
|
conditions_cache: condition_docids_cache,
|
||||||
dead_ends_cache,
|
dead_ends_cache,
|
||||||
all_costs,
|
all_costs,
|
||||||
cur_cost: 0,
|
cur_cost: 0,
|
||||||
|
next_max_cost,
|
||||||
};
|
};
|
||||||
|
|
||||||
self.state = Some(state);
|
self.state = Some(state);
|
||||||
@ -180,17 +200,13 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
|
|||||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||||
// If universe.len() <= 1, the bucket sort algorithm
|
|
||||||
// should not have called this function.
|
|
||||||
assert!(universe.len() > 1);
|
|
||||||
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
|
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
|
||||||
// should never happen
|
// should never happen
|
||||||
let mut state = self.state.take().unwrap();
|
let mut state = self.state.take().unwrap();
|
||||||
|
|
||||||
|
let all_costs = state.all_costs.get(state.graph.query_graph.root_node);
|
||||||
// Retrieve the cost of the paths to compute
|
// Retrieve the cost of the paths to compute
|
||||||
let Some(&cost) = state
|
let Some(&cost) = all_costs
|
||||||
.all_costs
|
|
||||||
.get(state.graph.query_graph.root_node)
|
|
||||||
.iter()
|
.iter()
|
||||||
.find(|c| **c >= state.cur_cost) else {
|
.find(|c| **c >= state.cur_cost) else {
|
||||||
self.state = None;
|
self.state = None;
|
||||||
@ -206,8 +222,12 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
|
|||||||
dead_ends_cache,
|
dead_ends_cache,
|
||||||
all_costs,
|
all_costs,
|
||||||
cur_cost: _,
|
cur_cost: _,
|
||||||
|
next_max_cost,
|
||||||
} = &mut state;
|
} = &mut state;
|
||||||
|
|
||||||
|
let rank = *next_max_cost - cost;
|
||||||
|
let score = G::rank_to_score(Rank { rank: rank as u32, max_rank: *next_max_cost as u32 });
|
||||||
|
|
||||||
let mut universe = universe.clone();
|
let mut universe = universe.clone();
|
||||||
|
|
||||||
let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner);
|
let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner);
|
||||||
@ -322,7 +342,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
|
|||||||
|
|
||||||
self.state = Some(state);
|
self.state = Some(state);
|
||||||
|
|
||||||
Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket }))
|
Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket, score }))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn end_iteration(
|
fn end_iteration(
|
||||||
|
@ -342,6 +342,25 @@ impl QueryGraph {
|
|||||||
}
|
}
|
||||||
res
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Number of words in the phrases in this query graph
|
||||||
|
pub(crate) fn words_in_phrases_count(&self, ctx: &SearchContext) -> usize {
|
||||||
|
let mut word_count = 0;
|
||||||
|
for (_, node) in self.nodes.iter() {
|
||||||
|
match &node.data {
|
||||||
|
QueryNodeData::Term(term) => {
|
||||||
|
let Some(phrase) = term.term_subset.original_phrase(ctx)
|
||||||
|
else {
|
||||||
|
continue
|
||||||
|
};
|
||||||
|
let phrase = ctx.phrase_interner.get(phrase);
|
||||||
|
word_count += phrase.words.iter().copied().filter(|a| a.is_some()).count()
|
||||||
|
}
|
||||||
|
_ => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
word_count
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn add_node(nodes_data: &mut Vec<QueryNodeData>, node_data: QueryNodeData) -> u16 {
|
fn add_node(nodes_data: &mut Vec<QueryNodeData>, node_data: QueryNodeData) -> u16 {
|
||||||
|
Loading…
Reference in New Issue
Block a user