Implement Frequency matching strategy

commit abdc4afcca
parent 19acc65ad2
Author: ManyTheFish
Date:   2024-05-29 11:06:39 +02:00

5 changed files with 78 additions and 1 deletion


@@ -276,6 +276,8 @@ pub enum TermsMatchingStrategy {
    Last,
    // all words are mandatory
    All,
    // remove the most frequent words first
    Frequency,
}

impl Default for TermsMatchingStrategy {
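The new `Frequency` variant asks the search to drop the most frequent (and therefore least discriminating) query words first when not every word can be matched. A minimal usage sketch, assuming the enum is re-exported at the crate root as `milli::TermsMatchingStrategy` and using a hypothetical `parse_strategy` helper:

    use milli::TermsMatchingStrategy;

    // Hypothetical helper: map a request parameter to a matching strategy.
    fn parse_strategy(s: &str) -> TermsMatchingStrategy {
        match s {
            // all query words are mandatory
            "all" => TermsMatchingStrategy::All,
            // drop the most frequent query words first
            "frequency" => TermsMatchingStrategy::Frequency,
            // default behaviour: drop words from the end of the query first
            _ => TermsMatchingStrategy::Last,
        }
    }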


@@ -164,6 +164,21 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
                    }
                    costs
                }
                TermsMatchingStrategy::Frequency => {
                    let removal_order =
                        query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?;
                    let mut forbidden_nodes =
                        SmallBitmap::for_interned_values_in(&query_graph.nodes);
                    let mut costs = query_graph.nodes.map(|_| None);
                    // FIXME: this works because only the `words` ranking rule uses TermsMatchingStrategy at the moment.
                    for ns in removal_order {
                        for n in ns.iter() {
                            *costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
                        }
                        forbidden_nodes.union(&ns);
                    }
                    costs
                }
                TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
            }
        } else {
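For the `Frequency` arm, the removal order is consumed bucket by bucket: every node in a bucket receives cost 1 together with a snapshot of the nodes seen in earlier buckets, and the bucket is then folded into `forbidden_nodes`. A standalone sketch of that accumulation, under the assumption that node ids are plain `usize` values and `SmallBitmap` is replaced by a `HashSet`:

    use std::collections::HashSet;

    fn assign_costs(removal_order: Vec<Vec<usize>>) -> Vec<Option<(u16, HashSet<usize>)>> {
        let max_node = removal_order.iter().flatten().copied().max().unwrap_or(0);
        let mut costs = vec![None; max_node + 1];
        let mut forbidden_nodes: HashSet<usize> = HashSet::new();
        for bucket in removal_order {
            for &n in &bucket {
                // cost 1, paired with a snapshot of all previously seen buckets
                costs[n] = Some((1, forbidden_nodes.clone()));
            }
            // nodes of this bucket become forbidden for the following buckets
            forbidden_nodes.extend(bucket);
        }
        costs
    }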


@@ -197,6 +197,11 @@ fn resolve_maximally_reduced_query_graph(
            .iter()
            .flat_map(|x| x.iter())
            .collect(),
        TermsMatchingStrategy::Frequency => query_graph
            .removal_order_for_terms_matching_strategy_frequency(ctx)?
            .iter()
            .flat_map(|x| x.iter())
            .collect(),
        TermsMatchingStrategy::All => vec![],
    };
    graph.remove_nodes_keep_edges(&nodes_to_remove);
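Here the bucketed removal order is simply flattened: for the maximally reduced query graph, every optional node is removed regardless of which bucket it sits in, mirroring the existing `Last` arm. A tiny sketch of that flattening, again assuming plain `usize` node ids:

    // Collapse the bucketed removal order into the full set of removable nodes.
    fn all_removable_nodes(removal_order: &[Vec<usize>]) -> Vec<usize> {
        removal_order.iter().flat_map(|bucket| bucket.iter().copied()).collect()
    }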


@@ -3,6 +3,7 @@ use std::collections::BTreeMap;
use std::hash::{Hash, Hasher};

use fxhash::{FxHashMap, FxHasher};
use roaring::RoaringBitmap;

use super::interner::{FixedSizeInterner, Interned};
use super::query_term::{
@@ -11,6 +12,7 @@ use super::query_term::{
use super::small_bitmap::SmallBitmap;
use super::SearchContext;
use crate::search::new::interner::Interner;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::Result;

/// A node of the [`QueryGraph`].
@@ -290,6 +292,49 @@ impl QueryGraph {
        }
    }

    pub fn removal_order_for_terms_matching_strategy_frequency(
        &self,
        ctx: &mut SearchContext,
    ) -> Result<Vec<SmallBitmap<QueryNode>>> {
        // look up the document frequency of each term
        let mut term_with_frequency: Vec<(u8, u64)> = {
            let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
            for (_, node) in self.nodes.iter() {
                match &node.data {
                    QueryNodeData::Term(t) => {
                        let docids = compute_query_term_subset_docids(ctx, &t.term_subset)?;
                        for id in t.term_ids.clone() {
                            term_docids
                                .entry(id)
                                .and_modify(|curr| *curr |= &docids)
                                .or_insert_with(|| docids.clone());
                        }
                    }
                    QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
                }
            }
            term_docids
                .into_iter()
                .map(|(idx, docids)| match docids.len() {
                    0 => (idx, u64::max_value()),
                    frequency => (idx, frequency),
                })
                .collect()
        };
        term_with_frequency.sort_by_key(|(_, frequency)| *frequency);
        let mut term_weight = BTreeMap::new();
        let mut weight: u16 = 1;
        let mut peekable = term_with_frequency.into_iter().peekable();
        while let Some((idx, frequency)) = peekable.next() {
            term_weight.insert(idx, weight);
            if peekable.peek().map_or(false, |(_, f)| frequency < *f) {
                weight += 1;
            }
        }
        let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
        Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
    }
    pub fn removal_order_for_terms_matching_strategy_last(
        &self,
        ctx: &SearchContext,
@@ -315,10 +360,19 @@ impl QueryGraph {
        if first_term_idx >= last_term_idx {
            return vec![];
        }
        let cost_of_term_idx = |term_idx: u8| {
            let rank = 1 + last_term_idx - term_idx;
            rank as u16
        };
        self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
    }

    pub fn removal_order_for_terms_matching_strategy(
        &self,
        ctx: &SearchContext,
        order: impl Fn(u8) -> u16,
    ) -> Vec<SmallBitmap<QueryNode>> {
        let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
        let mut at_least_one_mandatory_term = false;
        for (node_id, node) in self.nodes.iter() {
@@ -329,7 +383,7 @@ impl QueryGraph {
            }
            let mut cost = 0;
            for id in t.term_ids.clone() {
                cost = std::cmp::max(cost, order(id));
            }
            nodes_to_remove
                .entry(cost)
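The new helper first accumulates, per term id, the union of the document ids matched by every node carrying that term, then turns frequencies into removal weights with a dense ranking: after sorting by frequency, terms with equal frequency share a weight and the weight only advances when the frequency strictly increases (terms matching no document are mapped to `u64::max_value()` so they end up in their own bucket). A standalone sketch of that weight-to-bucket computation, assuming plain `u8` term ids instead of interned query-graph nodes:

    use std::collections::BTreeMap;

    // Group term ids into removal buckets: equal frequencies share a bucket,
    // and buckets are returned in ascending weight order.
    fn removal_buckets(mut term_with_frequency: Vec<(u8, u64)>) -> Vec<Vec<u8>> {
        term_with_frequency.sort_by_key(|(_, frequency)| *frequency);
        let mut term_weight = BTreeMap::new();
        let mut weight: u16 = 1;
        let mut peekable = term_with_frequency.into_iter().peekable();
        while let Some((idx, frequency)) = peekable.next() {
            term_weight.insert(idx, weight);
            // only advance the weight when the next frequency is strictly larger
            if peekable.peek().map_or(false, |(_, f)| frequency < *f) {
                weight += 1;
            }
        }
        let mut buckets = BTreeMap::<u16, Vec<u8>>::new();
        for (idx, weight) in term_weight {
            buckets.entry(weight).or_default().push(idx);
        }
        buckets.into_values().collect()
    }

    // e.g. removal_buckets(vec![(0, 10), (1, 10), (2, 3)]) == vec![vec![2], vec![0, 1]]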


@@ -159,6 +159,7 @@ pub fn expected_order(
    match optional_words {
        TermsMatchingStrategy::Last => groups.into_iter().flatten().collect(),
        TermsMatchingStrategy::Frequency => groups.into_iter().flatten().collect(),
        TermsMatchingStrategy::All => {
            groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()
        }