Fix bug in the proximity ranking rule for queries with ngrams

This commit is contained in:
Loïc Lecrenier 2023-03-15 12:52:40 +01:00
parent e9cf58d584
commit c0cdaf9f53
9 changed files with 132 additions and 68 deletions

View File

@ -36,6 +36,8 @@ That is we find the documents where either:
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by` - OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
*/ */
use std::ops::ControlFlow;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::interner::MappedInterner; use super::interner::MappedInterner;
@ -263,7 +265,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
graph.remove_edges_with_condition(condition); graph.remove_edges_with_condition(condition);
// 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
edge_docids_cache.cache.remove(&condition); edge_docids_cache.cache.remove(&condition);
return Ok(()); return Ok(ControlFlow::Continue(()));
} }
path_docids &= edge_docids; path_docids &= edge_docids;
@ -287,14 +289,18 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
} }
// We should maybe instead try to compute: // We should maybe instead try to compute:
// 0th & nth & 1st & n-1th & 2nd & etc... // 0th & nth & 1st & n-1th & 2nd & etc...
return Ok(()); return Ok(ControlFlow::Continue(()));
} }
} }
bucket |= &path_docids; bucket |= &path_docids;
// Reduce the size of the universe so that we can more optimistically discard candidate paths // Reduce the size of the universe so that we can more optimistically discard candidate paths
universe -= path_docids; universe -= path_docids;
// TODO: if the universe is empty, stop iterating
Ok(()) if universe.is_empty() {
Ok(ControlFlow::Break(()))
} else {
Ok(ControlFlow::Continue(()))
}
}, },
)?; )?;

View File

@ -10,7 +10,7 @@ use crate::search::new::interner::{Interned, MappedInterner};
use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm};
use crate::search::new::ranking_rule_graph::{ use crate::search::new::ranking_rule_graph::{
DeadEndPathCache, Edge, EdgeCondition, ProximityEdge, ProximityGraph, RankingRuleGraph, DeadEndPathCache, Edge, EdgeCondition, ProximityCondition, ProximityGraph, RankingRuleGraph,
RankingRuleGraphTrait, TypoEdge, TypoGraph, RankingRuleGraphTrait, TypoEdge, TypoGraph,
}; };
use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::small_bitmap::SmallBitmap;
@ -46,7 +46,7 @@ pub enum SearchEvents {
paths: Vec<Vec<u16>>, paths: Vec<Vec<u16>>,
empty_paths_cache: DeadEndPathCache<ProximityGraph>, empty_paths_cache: DeadEndPathCache<ProximityGraph>,
universe: RoaringBitmap, universe: RoaringBitmap,
distances: MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>, distances: MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16, cost: u16,
}, },
TypoState { TypoState {
@ -172,7 +172,7 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
paths_map: &[Vec<u16>], paths_map: &[Vec<u16>],
empty_paths_cache: &DeadEndPathCache<ProximityGraph>, empty_paths_cache: &DeadEndPathCache<ProximityGraph>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>, distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16, cost: u16,
) { ) {
self.events.push(SearchEvents::ProximityState { self.events.push(SearchEvents::ProximityState {

View File

@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
use super::interner::MappedInterner; use super::interner::MappedInterner;
use super::query_graph::QueryNode; use super::query_graph::QueryNode;
use super::ranking_rule_graph::{ use super::ranking_rule_graph::{
DeadEndPathCache, ProximityEdge, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph, DeadEndPathCache, ProximityCondition, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph,
}; };
use super::small_bitmap::SmallBitmap; use super::small_bitmap::SmallBitmap;
use super::{RankingRule, RankingRuleQueryTrait}; use super::{RankingRule, RankingRuleQueryTrait};
@ -68,7 +68,7 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
paths: &[Vec<u16>], paths: &[Vec<u16>],
empty_paths_cache: &DeadEndPathCache<ProximityGraph>, empty_paths_cache: &DeadEndPathCache<ProximityGraph>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>, distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16, cost: u16,
); );
@ -139,7 +139,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_paths_map: &[Vec<u16>], _paths_map: &[Vec<u16>],
_empty_paths_cache: &DeadEndPathCache<ProximityGraph>, _empty_paths_cache: &DeadEndPathCache<ProximityGraph>,
_universe: &RoaringBitmap, _universe: &RoaringBitmap,
_distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>, _distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
_cost: u16, _cost: u16,
) { ) {
} }

View File

@ -303,16 +303,16 @@ mod tests {
let mut ctx = SearchContext::new(&index, &txn); let mut ctx = SearchContext::new(&index, &txn);
let results = execute_search( let results = execute_search(
&mut ctx, &mut ctx,
// "which a the releases from poison by the government", "releases from poison by the government",
// "sun flower s are the best", // "sun flower s are the best",
"zero config", // "zero config",
TermsMatchingStrategy::Last, TermsMatchingStrategy::Last,
None, None,
0, 0,
20, 20,
&mut DefaultSearchLogger, &mut DefaultSearchLogger,
&mut DefaultSearchLogger, &mut DefaultSearchLogger,
//&mut logger, // &mut logger,
) )
.unwrap(); .unwrap();
@ -359,9 +359,9 @@ mod tests {
let start = Instant::now(); let start = Instant::now();
let mut s = Search::new(&txn, &index); let mut s = Search::new(&txn, &index);
s.query("which a the releases from poison by the government"); s.query("releases from poison by the government");
s.terms_matching_strategy(TermsMatchingStrategy::Last); s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
let docs = s.execute().unwrap(); let docs = s.execute().unwrap();
let elapsed = start.elapsed(); let elapsed = start.elapsed();

View File

@ -2,6 +2,7 @@
use std::collections::btree_map::Entry; use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, VecDeque}; use std::collections::{BTreeMap, VecDeque};
use std::ops::ControlFlow;
use super::empty_paths_cache::DeadEndPathCache; use super::empty_paths_cache::DeadEndPathCache;
use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
@ -23,7 +24,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
cost: u16, cost: u16,
all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>, all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>,
empty_paths_cache: &mut DeadEndPathCache<G>, empty_paths_cache: &mut DeadEndPathCache<G>,
mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<()>, mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<ControlFlow<()>>,
) -> Result<()> { ) -> Result<()> {
let _ = self.visit_paths_of_cost_rec( let _ = self.visit_paths_of_cost_rec(
from, from,
@ -43,7 +44,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
cost: u16, cost: u16,
all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>, all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>,
empty_paths_cache: &mut DeadEndPathCache<G>, empty_paths_cache: &mut DeadEndPathCache<G>,
visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<()>, visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<ControlFlow<()>>,
prev_conditions: &mut Vec<u16>, prev_conditions: &mut Vec<u16>,
cur_path: &mut SmallBitmap<G::EdgeCondition>, cur_path: &mut SmallBitmap<G::EdgeCondition>,
forbidden_conditions: &mut SmallBitmap<G::EdgeCondition>, forbidden_conditions: &mut SmallBitmap<G::EdgeCondition>,
@ -60,7 +61,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
EdgeCondition::Unconditional => { EdgeCondition::Unconditional => {
if edge.dest_node == self.query_graph.end_node { if edge.dest_node == self.query_graph.end_node {
any_valid = true; any_valid = true;
visit(prev_conditions, self, empty_paths_cache)?; let control_flow = visit(prev_conditions, self, empty_paths_cache)?;
match control_flow {
ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return Ok(true),
}
true true
} else { } else {
self.visit_paths_of_cost_rec( self.visit_paths_of_cost_rec(
@ -101,7 +106,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
); );
let next_any_valid = if edge.dest_node == self.query_graph.end_node { let next_any_valid = if edge.dest_node == self.query_graph.end_node {
any_valid = true; any_valid = true;
visit(prev_conditions, self, empty_paths_cache)?; let control_flow = visit(prev_conditions, self, empty_paths_cache)?;
match control_flow {
ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return Ok(true),
}
true true
} else { } else {
self.visit_paths_of_cost_rec( self.visit_paths_of_cost_rec(

View File

@ -20,7 +20,7 @@ use std::hash::Hash;
pub use edge_docids_cache::EdgeConditionDocIdsCache; pub use edge_docids_cache::EdgeConditionDocIdsCache;
pub use empty_paths_cache::DeadEndPathCache; pub use empty_paths_cache::DeadEndPathCache;
pub use proximity::{ProximityEdge, ProximityGraph}; pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
pub use typo::{TypoEdge, TypoGraph}; pub use typo::{TypoEdge, TypoGraph};

View File

@ -1,7 +1,7 @@
#![allow(clippy::too_many_arguments)] #![allow(clippy::too_many_arguments)]
use std::collections::BTreeMap; use std::collections::BTreeMap;
use super::ProximityEdge; use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache; use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned}; use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_graph::QueryNodeData;
@ -37,10 +37,10 @@ fn first_word_of_term_iter<'t>(
pub fn build_edges<'ctx>( pub fn build_edges<'ctx>(
ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,
conditions_interner: &mut DedupInterner<ProximityEdge>, conditions_interner: &mut DedupInterner<ProximityCondition>,
from_node: &QueryNode, from_node: &QueryNode,
to_node: &QueryNode, to_node: &QueryNode,
) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> { ) -> Result<Vec<(u8, EdgeCondition<ProximityCondition>)>> {
let SearchContext { let SearchContext {
index, index,
txn, txn,
@ -51,24 +51,33 @@ pub fn build_edges<'ctx>(
term_docids: _, term_docids: _,
} = ctx; } = ctx;
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNodeData::End => return Ok(vec![]),
};
let right_term = match &to_node.data { let right_term = match &to_node.data {
QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
QueryNodeData::Term(term) => term, QueryNodeData::Term(term) => term,
}; };
let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term;
let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
let (right_term, right_start_position, right_ngram_length) = let (right_term, right_start_position, right_ngram_length) =
(term_interner.get(*right_value), *right_positions.start(), right_positions.len()); (term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => {
return Ok(vec![(
(right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner
.insert(ProximityCondition::Term { term: *right_term_interned }),
),
)])
}
QueryNodeData::End => return Ok(vec![]),
};
if left_end_position + 1 != right_start_position { if left_end_position + 1 != right_start_position {
// We want to ignore this pair of terms // We want to ignore this pair of terms
@ -77,7 +86,12 @@ pub fn build_edges<'ctx>(
// `flowers` is removed by the `words` ranking rule. // `flowers` is removed by the `words` ranking rule.
// The remaining query graph represents `the sun .. are beautiful` // The remaining query graph represents `the sun .. are beautiful`
// but `sun` and `are` have no proximity condition between them // but `sun` and `are` have no proximity condition between them
return Ok(vec![(0, EdgeCondition::Unconditional)]); return Ok(vec![(
(right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
),
)]);
} }
let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new(); let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();
@ -121,24 +135,30 @@ pub fn build_edges<'ctx>(
} }
} }
let mut new_edges = let mut new_edges = cost_proximity_word_pairs
cost_proximity_word_pairs
.into_iter() .into_iter()
.flat_map(|(cost, proximity_word_pairs)| { .flat_map(|(cost, proximity_word_pairs)| {
let mut edges = vec![]; let mut edges = vec![];
for (proximity, word_pairs) in proximity_word_pairs { for (proximity, word_pairs) in proximity_word_pairs {
edges.push(( edges.push((
cost, cost,
EdgeCondition::Conditional(conditions_interner.insert(ProximityEdge { EdgeCondition::Conditional(conditions_interner.insert(
ProximityCondition::Pairs {
pairs: word_pairs.into_boxed_slice(), pairs: word_pairs.into_boxed_slice(),
proximity, proximity,
})), },
)),
)) ))
} }
edges edges
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
new_edges.push((8 + (right_ngram_length - 1) as u8, EdgeCondition::Unconditional)); new_edges.push((
8 + (right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
),
));
Ok(new_edges) Ok(new_edges)
} }

View File

@ -1,16 +1,39 @@
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{ProximityEdge, WordPair}; use super::{ProximityCondition, WordPair};
use crate::search::new::SearchContext; use crate::search::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result}; use crate::{CboRoaringBitmapCodec, Result};
pub fn compute_docids<'ctx>( pub fn compute_docids<'ctx>(
ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,
edge: &ProximityEdge, edge: &ProximityCondition,
universe: &RoaringBitmap, universe: &RoaringBitmap,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let SearchContext { index, txn, db_cache, word_interner, .. } = ctx; let SearchContext {
let ProximityEdge { pairs, proximity } = edge; index,
txn,
db_cache,
word_interner,
term_docids,
phrase_interner,
term_interner,
} = ctx;
let (pairs, proximity) = match edge {
ProximityCondition::Term { term } => {
return term_docids
.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
*term,
)
.cloned()
}
ProximityCondition::Pairs { pairs, proximity } => (pairs, proximity),
};
let mut pair_docids = RoaringBitmap::new(); let mut pair_docids = RoaringBitmap::new();
for pair in pairs.iter() { for pair in pairs.iter() {
let pair = match pair { let pair = match pair {

View File

@ -4,15 +4,15 @@ pub mod compute_docids;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::empty_paths_cache::DeadEndPathCache; use super::empty_paths_cache::DeadEndPathCache;
use super::{EdgeCondition, RankingRuleGraphTrait}; use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
use crate::search::new::logger::SearchLogger; use crate::search::new::logger::SearchLogger;
use crate::search::new::query_term::Phrase; use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::small_bitmap::SmallBitmap;
use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::search::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result; use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WordPair { pub enum WordPair {
Words { Words {
phrases: Vec<Interned<Phrase>>, phrases: Vec<Interned<Phrase>>,
@ -31,27 +31,33 @@ pub enum WordPair {
} }
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub struct ProximityEdge { pub enum ProximityCondition {
pairs: Box<[WordPair]>, Term { term: Interned<QueryTerm> },
proximity: u8, Pairs { pairs: Box<[WordPair]>, proximity: u8 },
} }
pub enum ProximityGraph {} pub enum ProximityGraph {}
impl RankingRuleGraphTrait for ProximityGraph { impl RankingRuleGraphTrait for ProximityGraph {
type EdgeCondition = ProximityEdge; type EdgeCondition = ProximityCondition;
fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String { fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String {
let ProximityEdge { pairs, proximity } = edge; match edge {
format!(", prox {proximity}, {} pairs", pairs.len()) ProximityCondition::Term { term } => {
format!("term {term}")
}
ProximityCondition::Pairs { pairs, proximity } => {
format!("prox {proximity}, {} pairs", pairs.len())
}
}
} }
fn resolve_edge_condition<'ctx>( fn resolve_edge_condition<'ctx>(
ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,
edge: &Self::EdgeCondition, condition: &Self::EdgeCondition,
universe: &RoaringBitmap, universe: &RoaringBitmap,
) -> Result<roaring::RoaringBitmap> { ) -> Result<roaring::RoaringBitmap> {
compute_docids::compute_docids(ctx, edge, universe) compute_docids::compute_docids(ctx, condition, universe)
} }
fn build_edges<'ctx>( fn build_edges<'ctx>(
@ -64,11 +70,11 @@ impl RankingRuleGraphTrait for ProximityGraph {
} }
fn log_state( fn log_state(
graph: &super::RankingRuleGraph<Self>, graph: &RankingRuleGraph<Self>,
paths: &[Vec<u16>], paths: &[Vec<u16>],
empty_paths_cache: &DeadEndPathCache<Self>, empty_paths_cache: &DeadEndPathCache<Self>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>, distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16, cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>, logger: &mut dyn SearchLogger<QueryGraph>,
) { ) {