Fix bug in the proximity ranking rule for queries with ngrams

This commit is contained in:
Loïc Lecrenier 2023-03-15 12:52:40 +01:00
parent e9cf58d584
commit c0cdaf9f53
9 changed files with 132 additions and 68 deletions

View File

@ -36,6 +36,8 @@ That is we find the documents where either:
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
*/
use std::ops::ControlFlow;
use roaring::RoaringBitmap;
use super::interner::MappedInterner;
@ -263,7 +265,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
graph.remove_edges_with_condition(condition);
// 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
edge_docids_cache.cache.remove(&condition);
return Ok(());
return Ok(ControlFlow::Continue(()));
}
path_docids &= edge_docids;
@ -287,14 +289,18 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
}
// We should maybe instead try to compute:
// 0th & nth & 1st & n-1th & 2nd & etc...
return Ok(());
return Ok(ControlFlow::Continue(()));
}
}
bucket |= &path_docids;
// Reduce the size of the universe so that we can more optimistically discard candidate paths
universe -= path_docids;
// If the universe is now empty, tell the path visitor to stop iterating (`ControlFlow::Break`)
Ok(())
if universe.is_empty() {
Ok(ControlFlow::Break(()))
} else {
Ok(ControlFlow::Continue(()))
}
},
)?;

View File

@ -10,7 +10,7 @@ use crate::search::new::interner::{Interned, MappedInterner};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm};
use crate::search::new::ranking_rule_graph::{
DeadEndPathCache, Edge, EdgeCondition, ProximityEdge, ProximityGraph, RankingRuleGraph,
DeadEndPathCache, Edge, EdgeCondition, ProximityCondition, ProximityGraph, RankingRuleGraph,
RankingRuleGraphTrait, TypoEdge, TypoGraph,
};
use crate::search::new::small_bitmap::SmallBitmap;
@ -46,7 +46,7 @@ pub enum SearchEvents {
paths: Vec<Vec<u16>>,
empty_paths_cache: DeadEndPathCache<ProximityGraph>,
universe: RoaringBitmap,
distances: MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>,
distances: MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16,
},
TypoState {
@ -172,7 +172,7 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
paths_map: &[Vec<u16>],
empty_paths_cache: &DeadEndPathCache<ProximityGraph>,
universe: &RoaringBitmap,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16,
) {
self.events.push(SearchEvents::ProximityState {

View File

@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
use super::interner::MappedInterner;
use super::query_graph::QueryNode;
use super::ranking_rule_graph::{
DeadEndPathCache, ProximityEdge, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph,
DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph,
};
use super::small_bitmap::SmallBitmap;
use super::{RankingRule, RankingRuleQueryTrait};
@ -68,7 +68,7 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
paths: &[Vec<u16>],
empty_paths_cache: &DeadEndPathCache<ProximityGraph>,
universe: &RoaringBitmap,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16,
);
@ -139,7 +139,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_paths_map: &[Vec<u16>],
_empty_paths_cache: &DeadEndPathCache<ProximityGraph>,
_universe: &RoaringBitmap,
_distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>,
_distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
_cost: u16,
) {
}

View File

@ -303,16 +303,16 @@ mod tests {
let mut ctx = SearchContext::new(&index, &txn);
let results = execute_search(
&mut ctx,
// "which a the releases from poison by the government",
"releases from poison by the government",
// "sun flower s are the best",
"zero config",
// "zero config",
TermsMatchingStrategy::Last,
None,
0,
20,
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
//&mut logger,
// &mut logger,
)
.unwrap();
@ -359,9 +359,9 @@ mod tests {
let start = Instant::now();
let mut s = Search::new(&txn, &index);
s.query("which a the releases from poison by the government");
s.query("releases from poison by the government");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
// s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
let docs = s.execute().unwrap();
let elapsed = start.elapsed();

View File

@ -2,6 +2,7 @@
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, VecDeque};
use std::ops::ControlFlow;
use super::empty_paths_cache::DeadEndPathCache;
use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
@ -23,7 +24,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
cost: u16,
all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>,
empty_paths_cache: &mut DeadEndPathCache<G>,
mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<()>,
mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<ControlFlow<()>>,
) -> Result<()> {
let _ = self.visit_paths_of_cost_rec(
from,
@ -43,7 +44,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
cost: u16,
all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>,
empty_paths_cache: &mut DeadEndPathCache<G>,
visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<()>,
visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<ControlFlow<()>>,
prev_conditions: &mut Vec<u16>,
cur_path: &mut SmallBitmap<G::EdgeCondition>,
forbidden_conditions: &mut SmallBitmap<G::EdgeCondition>,
@ -60,7 +61,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
EdgeCondition::Unconditional => {
if edge.dest_node == self.query_graph.end_node {
any_valid = true;
visit(prev_conditions, self, empty_paths_cache)?;
let control_flow = visit(prev_conditions, self, empty_paths_cache)?;
match control_flow {
ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return Ok(true),
}
true
} else {
self.visit_paths_of_cost_rec(
@ -101,7 +106,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
);
let next_any_valid = if edge.dest_node == self.query_graph.end_node {
any_valid = true;
visit(prev_conditions, self, empty_paths_cache)?;
let control_flow = visit(prev_conditions, self, empty_paths_cache)?;
match control_flow {
ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return Ok(true),
}
true
} else {
self.visit_paths_of_cost_rec(

View File

@ -20,7 +20,7 @@ use std::hash::Hash;
pub use edge_docids_cache::EdgeConditionDocIdsCache;
pub use empty_paths_cache::DeadEndPathCache;
pub use proximity::{ProximityEdge, ProximityGraph};
pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap;
pub use typo::{TypoEdge, TypoGraph};

View File

@ -1,7 +1,7 @@
#![allow(clippy::too_many_arguments)]
use std::collections::BTreeMap;
use super::ProximityEdge;
use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_graph::QueryNodeData;
@ -37,10 +37,10 @@ fn first_word_of_term_iter<'t>(
pub fn build_edges<'ctx>(
ctx: &mut SearchContext<'ctx>,
conditions_interner: &mut DedupInterner<ProximityEdge>,
conditions_interner: &mut DedupInterner<ProximityCondition>,
from_node: &QueryNode,
to_node: &QueryNode,
) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> {
) -> Result<Vec<(u8, EdgeCondition<ProximityCondition>)>> {
let SearchContext {
index,
txn,
@ -51,24 +51,33 @@ pub fn build_edges<'ctx>(
term_docids: _,
} = ctx;
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNodeData::End => return Ok(vec![]),
};
let right_term = match &to_node.data {
QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
QueryNodeData::Term(term) => term,
};
let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term;
let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
let (right_term, right_start_position, right_ngram_length) =
(term_interner.get(*right_value), *right_positions.start(), right_positions.len());
(term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => {
return Ok(vec![(
(right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner
.insert(ProximityCondition::Term { term: *right_term_interned }),
),
)])
}
QueryNodeData::End => return Ok(vec![]),
};
if left_end_position + 1 != right_start_position {
// We want to ignore this pair of terms
@ -77,7 +86,12 @@ pub fn build_edges<'ctx>(
// `flowers` is removed by the `words` ranking rule.
// The remaining query graph represents `the sun .. are beautiful`
// but `sun` and `are` have no proximity condition between them
return Ok(vec![(0, EdgeCondition::Unconditional)]);
return Ok(vec![(
(right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
),
)]);
}
let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();
@ -121,24 +135,30 @@ pub fn build_edges<'ctx>(
}
}
let mut new_edges =
cost_proximity_word_pairs
.into_iter()
.flat_map(|(cost, proximity_word_pairs)| {
let mut edges = vec![];
for (proximity, word_pairs) in proximity_word_pairs {
edges.push((
cost,
EdgeCondition::Conditional(conditions_interner.insert(ProximityEdge {
let mut new_edges = cost_proximity_word_pairs
.into_iter()
.flat_map(|(cost, proximity_word_pairs)| {
let mut edges = vec![];
for (proximity, word_pairs) in proximity_word_pairs {
edges.push((
cost,
EdgeCondition::Conditional(conditions_interner.insert(
ProximityCondition::Pairs {
pairs: word_pairs.into_boxed_slice(),
proximity,
})),
))
}
edges
})
.collect::<Vec<_>>();
new_edges.push((8 + (right_ngram_length - 1) as u8, EdgeCondition::Unconditional));
},
)),
))
}
edges
})
.collect::<Vec<_>>();
new_edges.push((
8 + (right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
),
));
Ok(new_edges)
}

View File

@ -1,16 +1,39 @@
use roaring::RoaringBitmap;
use super::{ProximityEdge, WordPair};
use super::{ProximityCondition, WordPair};
use crate::search::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result};
pub fn compute_docids<'ctx>(
ctx: &mut SearchContext<'ctx>,
edge: &ProximityEdge,
edge: &ProximityCondition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
let SearchContext { index, txn, db_cache, word_interner, .. } = ctx;
let ProximityEdge { pairs, proximity } = edge;
let SearchContext {
index,
txn,
db_cache,
word_interner,
term_docids,
phrase_interner,
term_interner,
} = ctx;
let (pairs, proximity) = match edge {
ProximityCondition::Term { term } => {
return term_docids
.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
*term,
)
.cloned()
}
ProximityCondition::Pairs { pairs, proximity } => (pairs, proximity),
};
let mut pair_docids = RoaringBitmap::new();
for pair in pairs.iter() {
let pair = match pair {

View File

@ -4,15 +4,15 @@ pub mod compute_docids;
use roaring::RoaringBitmap;
use super::empty_paths_cache::DeadEndPathCache;
use super::{EdgeCondition, RankingRuleGraphTrait};
use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
use crate::search::new::logger::SearchLogger;
use crate::search::new::query_term::Phrase;
use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::small_bitmap::SmallBitmap;
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WordPair {
Words {
phrases: Vec<Interned<Phrase>>,
@ -31,27 +31,33 @@ pub enum WordPair {
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct ProximityEdge {
pairs: Box<[WordPair]>,
proximity: u8,
pub enum ProximityCondition {
Term { term: Interned<QueryTerm> },
Pairs { pairs: Box<[WordPair]>, proximity: u8 },
}
pub enum ProximityGraph {}
impl RankingRuleGraphTrait for ProximityGraph {
type EdgeCondition = ProximityEdge;
type EdgeCondition = ProximityCondition;
fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String {
let ProximityEdge { pairs, proximity } = edge;
format!(", prox {proximity}, {} pairs", pairs.len())
match edge {
ProximityCondition::Term { term } => {
format!("term {term}")
}
ProximityCondition::Pairs { pairs, proximity } => {
format!("prox {proximity}, {} pairs", pairs.len())
}
}
}
fn resolve_edge_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
edge: &Self::EdgeCondition,
condition: &Self::EdgeCondition,
universe: &RoaringBitmap,
) -> Result<roaring::RoaringBitmap> {
compute_docids::compute_docids(ctx, edge, universe)
compute_docids::compute_docids(ctx, condition, universe)
}
fn build_edges<'ctx>(
@ -64,11 +70,11 @@ impl RankingRuleGraphTrait for ProximityGraph {
}
fn log_state(
graph: &super::RankingRuleGraph<Self>,
graph: &RankingRuleGraph<Self>,
paths: &[Vec<u16>],
empty_paths_cache: &DeadEndPathCache<Self>,
universe: &RoaringBitmap,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
) {