2023-03-08 13:26:29 +01:00
|
|
|
/*! Implementation of a generic graph-based ranking rule.
|
|
|
|
|
|
|
|
A graph-based ranking rule is a ranking rule that works by representing
|
|
|
|
its possible operations and their relevancy cost as a directed acyclic multi-graph
|
|
|
|
built on top of the query graph. It then computes its buckets by finding the
|
|
|
|
cheapest paths from the start node to the end node and computing the document ids
|
|
|
|
that satisfy those paths.
|
|
|
|
|
|
|
|
For example, the proximity ranking rule builds a graph where the edges between two
|
|
|
|
nodes represent a condition that the term of the source node is in a certain proximity
|
|
|
|
to the term of the destination node. With the query "pretty house by" where the term
|
|
|
|
"pretty" has three possible proximities to the term "house" and "house" has two
|
|
|
|
proximities to "by", the graph will look like this:
|
|
|
|
|
|
|
|
```txt
|
|
|
|
┌───────┐ ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐ ┌───────┐
|
|
|
|
│ START │──0─▶│pretty │─────2────▶│ house │ │ by │─0─▶│ END │
|
|
|
|
└───────┘ └───────┘─────3────▶└───────┘──2──▶└─────┘ └───────┘
|
|
|
|
```
|
|
|
|
The proximity ranking rule's first bucket will be determined by the union of all
|
|
|
|
the shortest paths from START to END, which in this case is:
|
|
|
|
```txt
|
|
|
|
START --0-> pretty --1--> house --1--> by --0--> end
|
|
|
|
```
|
|
|
|
The path's corresponding document ids are found by taking the intersection of the
|
|
|
|
document ids of each edge. That is, we find the documents where both `pretty` is
|
|
|
|
1-close to `house` AND `house` is 1-close to `by`.
|
|
|
|
|
|
|
|
For the second bucket, we get the union of the second-cheapest paths, which are:
|
|
|
|
```txt
|
|
|
|
START --0-> pretty --1--> house --2--> by --0--> end
|
|
|
|
START --0-> pretty --2--> house --1--> by --0--> end
|
|
|
|
```
|
|
|
|
That is, we find the documents where either:
|
|
|
|
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
|
|
|
|
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
|
|
|
|
*/
|
|
|
|
|
2023-03-15 16:08:43 +01:00
|
|
|
use std::collections::HashSet;
|
2023-03-15 12:52:40 +01:00
|
|
|
use std::ops::ControlFlow;
|
|
|
|
|
2023-03-08 09:55:53 +01:00
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
|
2023-03-14 16:37:47 +01:00
|
|
|
use super::interner::MappedInterner;
|
2023-02-22 15:34:37 +01:00
|
|
|
use super::logger::SearchLogger;
|
2023-03-14 16:37:47 +01:00
|
|
|
use super::query_graph::QueryNode;
|
2023-03-08 09:55:53 +01:00
|
|
|
use super::ranking_rule_graph::{
|
2023-03-14 16:37:47 +01:00
|
|
|
DeadEndPathCache, EdgeCondition, EdgeConditionDocIdsCache, ProximityGraph, RankingRuleGraph,
|
2023-03-13 12:46:32 +01:00
|
|
|
RankingRuleGraphTrait, TypoGraph,
|
2023-03-08 09:55:53 +01:00
|
|
|
};
|
2023-03-08 09:53:05 +01:00
|
|
|
use super::small_bitmap::SmallBitmap;
|
2023-03-13 12:46:32 +01:00
|
|
|
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
2023-03-14 16:37:47 +01:00
|
|
|
use crate::search::new::interner::Interned;
|
2023-03-15 16:08:43 +01:00
|
|
|
use crate::search::new::query_graph::QueryNodeData;
|
2023-03-06 19:21:55 +01:00
|
|
|
use crate::Result;
|
2023-02-21 09:48:49 +01:00
|
|
|
|
2023-03-13 09:52:17 +01:00
|
|
|
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
|
|
|
|
impl Default for GraphBasedRankingRule<ProximityGraph> {
|
|
|
|
fn default() -> Self {
|
|
|
|
Self::new("proximity".to_owned())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pub type Typo = GraphBasedRankingRule<TypoGraph>;
|
|
|
|
impl Default for GraphBasedRankingRule<TypoGraph> {
|
|
|
|
fn default() -> Self {
|
|
|
|
Self::new("typo".to_owned())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-08 13:26:29 +01:00
|
|
|
/// A generic graph-based ranking rule
|
2023-02-21 09:48:49 +01:00
|
|
|
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
2023-02-22 15:34:37 +01:00
|
|
|
id: String,
|
2023-03-08 13:26:29 +01:00
|
|
|
// When the ranking rule is not iterating over its buckets,
|
|
|
|
// its state is `None`.
|
2023-02-21 09:48:49 +01:00
|
|
|
state: Option<GraphBasedRankingRuleState<G>>,
|
|
|
|
}
|
2023-02-22 15:34:37 +01:00
|
|
|
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
    /// Builds a ranking rule identified by `id`, with no iteration in progress.
    pub fn new(id: String) -> Self {
        let state = None;
        Self { id, state }
    }
}
|
|
|
|
|
2023-03-08 13:26:29 +01:00
|
|
|
/// The internal state of a graph-based ranking rule during iteration
|
2023-02-21 09:48:49 +01:00
|
|
|
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
2023-03-08 13:26:29 +01:00
|
|
|
/// The current graph
|
2023-02-21 09:48:49 +01:00
|
|
|
graph: RankingRuleGraph<G>,
|
2023-03-08 13:26:29 +01:00
|
|
|
/// Cache to retrieve the docids associated with each edge
|
2023-03-14 16:37:47 +01:00
|
|
|
edge_conditions_cache: EdgeConditionDocIdsCache<G>,
|
2023-03-08 13:26:29 +01:00
|
|
|
/// Cache used to optimistically discard paths that resolve to no documents.
|
2023-03-14 16:37:47 +01:00
|
|
|
empty_paths_cache: DeadEndPathCache<G>,
|
2023-03-08 13:26:29 +01:00
|
|
|
/// A structure giving the list of possible costs from each node to the end node,
|
|
|
|
/// along with a set of unavoidable edges that must be traversed to achieve that distance.
|
2023-03-14 16:37:47 +01:00
|
|
|
all_distances: MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>,
|
2023-03-08 13:26:29 +01:00
|
|
|
/// An index in the first element of `all_distances`, giving the cost of the next bucket
|
2023-03-02 21:27:42 +01:00
|
|
|
cur_distance_idx: usize,
|
|
|
|
}
|
|
|
|
|
2023-03-08 13:26:29 +01:00
|
|
|
/// Traverse each edge of the graph, computes its associated document ids,
|
|
|
|
/// and remove this edge from the graph if its docids are disjoint with the
|
|
|
|
/// given universe.
|
2023-03-13 14:03:48 +01:00
|
|
|
fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>(
|
|
|
|
ctx: &mut SearchContext<'ctx>,
|
2023-03-02 21:27:42 +01:00
|
|
|
graph: &mut RankingRuleGraph<G>,
|
2023-03-14 16:37:47 +01:00
|
|
|
edge_docids_cache: &mut EdgeConditionDocIdsCache<G>,
|
2023-03-02 21:27:42 +01:00
|
|
|
universe: &RoaringBitmap,
|
2023-03-14 16:37:47 +01:00
|
|
|
empty_paths_cache: &mut DeadEndPathCache<G>,
|
2023-03-02 21:27:42 +01:00
|
|
|
) -> Result<()> {
|
2023-03-14 16:37:47 +01:00
|
|
|
for edge_id in graph.edges_store.indexes() {
|
|
|
|
let Some(edge) = graph.edges_store.get(edge_id).as_ref() else {
|
2023-03-02 21:27:42 +01:00
|
|
|
continue;
|
2023-03-13 12:46:32 +01:00
|
|
|
};
|
|
|
|
let condition = edge.condition;
|
|
|
|
|
|
|
|
match condition {
|
|
|
|
EdgeCondition::Unconditional => continue,
|
|
|
|
EdgeCondition::Conditional(condition) => {
|
|
|
|
let docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, universe)?;
|
2023-03-07 14:42:58 +01:00
|
|
|
if docids.is_disjoint(universe) {
|
2023-03-14 16:37:47 +01:00
|
|
|
graph.remove_edges_with_condition(condition);
|
|
|
|
empty_paths_cache.add_condition(condition);
|
2023-03-13 12:46:32 +01:00
|
|
|
edge_docids_cache.cache.remove(&condition);
|
2023-03-02 21:27:42 +01:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(())
|
2023-02-21 09:48:49 +01:00
|
|
|
}
|
|
|
|
|
2023-03-13 14:03:48 +01:00
|
|
|
impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule<G> {
|
2023-02-22 15:34:37 +01:00
|
|
|
fn id(&self) -> String {
|
|
|
|
self.id.clone()
|
|
|
|
}
|
2023-02-21 09:48:49 +01:00
|
|
|
fn start_iteration(
|
|
|
|
&mut self,
|
2023-03-13 14:03:48 +01:00
|
|
|
ctx: &mut SearchContext<'ctx>,
|
2023-02-28 11:49:24 +01:00
|
|
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
2023-03-02 21:27:42 +01:00
|
|
|
universe: &RoaringBitmap,
|
2023-02-21 09:48:49 +01:00
|
|
|
query_graph: &QueryGraph,
|
|
|
|
) -> Result<()> {
|
2023-03-06 19:21:55 +01:00
|
|
|
let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;
|
2023-03-14 16:37:47 +01:00
|
|
|
let mut edge_docids_cache = EdgeConditionDocIdsCache::default();
|
|
|
|
let mut empty_paths_cache = DeadEndPathCache::new(&graph.conditions_interner);
|
2023-03-02 21:27:42 +01:00
|
|
|
|
2023-03-08 13:26:29 +01:00
|
|
|
// First simplify the graph as much as possible, by computing the docids of the edges
|
|
|
|
// within the rule's universe and removing the edges that have no associated docids.
|
2023-03-02 21:27:42 +01:00
|
|
|
remove_empty_edges(
|
2023-03-06 19:21:55 +01:00
|
|
|
ctx,
|
2023-03-02 21:27:42 +01:00
|
|
|
&mut graph,
|
|
|
|
&mut edge_docids_cache,
|
|
|
|
universe,
|
|
|
|
&mut empty_paths_cache,
|
|
|
|
)?;
|
2023-03-08 13:26:29 +01:00
|
|
|
|
|
|
|
// Then pre-compute the cost of all paths from each node to the end node
|
2023-03-08 09:53:05 +01:00
|
|
|
let all_distances = graph.initialize_distances_with_necessary_edges();
|
2023-02-21 09:48:49 +01:00
|
|
|
|
|
|
|
let state = GraphBasedRankingRuleState {
|
|
|
|
graph,
|
2023-03-13 12:46:32 +01:00
|
|
|
edge_conditions_cache: edge_docids_cache,
|
2023-03-02 21:27:42 +01:00
|
|
|
empty_paths_cache,
|
|
|
|
all_distances,
|
|
|
|
cur_distance_idx: 0,
|
2023-02-21 09:48:49 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
self.state = Some(state);
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
|
|
|
fn next_bucket(
|
|
|
|
&mut self,
|
2023-03-13 14:03:48 +01:00
|
|
|
ctx: &mut SearchContext<'ctx>,
|
2023-02-22 15:34:37 +01:00
|
|
|
logger: &mut dyn SearchLogger<QueryGraph>,
|
2023-02-21 09:48:49 +01:00
|
|
|
universe: &RoaringBitmap,
|
|
|
|
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
2023-03-08 13:26:29 +01:00
|
|
|
// If universe.len() <= 1, the bucket sort algorithm
|
|
|
|
// should not have called this function.
|
2023-02-21 09:48:49 +01:00
|
|
|
assert!(universe.len() > 1);
|
2023-03-08 13:26:29 +01:00
|
|
|
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
|
|
|
|
// should never happen
|
2023-02-21 09:48:49 +01:00
|
|
|
let mut state = self.state.take().unwrap();
|
2023-03-08 09:53:05 +01:00
|
|
|
|
2023-03-13 12:46:32 +01:00
|
|
|
// TODO: does this have a real positive performance impact?
|
2023-03-02 21:27:42 +01:00
|
|
|
remove_empty_edges(
|
2023-03-06 19:21:55 +01:00
|
|
|
ctx,
|
2023-03-02 21:27:42 +01:00
|
|
|
&mut state.graph,
|
2023-03-13 12:46:32 +01:00
|
|
|
&mut state.edge_conditions_cache,
|
2023-03-02 21:27:42 +01:00
|
|
|
universe,
|
|
|
|
&mut state.empty_paths_cache,
|
|
|
|
)?;
|
2023-02-28 14:19:57 +01:00
|
|
|
|
2023-03-08 13:26:29 +01:00
|
|
|
// If the cur_distance_idx does not point to a valid cost in the `all_distances`
|
|
|
|
// structure, then we have computed all the buckets and can return.
|
2023-03-02 21:27:42 +01:00
|
|
|
if state.cur_distance_idx
|
2023-03-14 16:37:47 +01:00
|
|
|
>= state.all_distances.get(state.graph.query_graph.root_node).len()
|
2023-03-02 21:27:42 +01:00
|
|
|
{
|
|
|
|
self.state = None;
|
2023-02-28 14:19:57 +01:00
|
|
|
return Ok(None);
|
|
|
|
}
|
2023-03-08 13:26:29 +01:00
|
|
|
|
|
|
|
// Retrieve the cost of the paths to compute
|
2023-03-08 09:53:05 +01:00
|
|
|
let (cost, _) =
|
2023-03-14 16:37:47 +01:00
|
|
|
state.all_distances.get(state.graph.query_graph.root_node)[state.cur_distance_idx];
|
2023-03-02 21:27:42 +01:00
|
|
|
state.cur_distance_idx += 1;
|
|
|
|
|
2023-03-07 14:42:58 +01:00
|
|
|
let mut bucket = RoaringBitmap::new();
|
|
|
|
|
|
|
|
let GraphBasedRankingRuleState {
|
|
|
|
graph,
|
2023-03-13 12:46:32 +01:00
|
|
|
edge_conditions_cache: edge_docids_cache,
|
2023-03-07 14:42:58 +01:00
|
|
|
empty_paths_cache,
|
|
|
|
all_distances,
|
|
|
|
cur_distance_idx: _,
|
|
|
|
} = &mut state;
|
|
|
|
|
2023-03-13 14:03:48 +01:00
|
|
|
let original_universe = universe;
|
2023-03-07 14:42:58 +01:00
|
|
|
let mut universe = universe.clone();
|
|
|
|
|
2023-03-13 14:03:48 +01:00
|
|
|
let original_graph = graph.clone();
|
2023-03-15 16:08:43 +01:00
|
|
|
let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner);
|
2023-03-08 13:26:29 +01:00
|
|
|
let mut paths = vec![];
|
|
|
|
|
|
|
|
// For each path of the given cost, we will compute its associated
|
|
|
|
// document ids.
|
|
|
|
// In case the path does not resolve to any document id, we try to figure out why
|
|
|
|
// and update the `empty_paths_cache` accordingly.
|
|
|
|
// For example, it may be that the path is empty because one of its edges is disjoint
|
|
|
|
// with the universe, or because a prefix of the path is disjoint with the universe, or because
|
|
|
|
// the path contains two edges that are disjoint from each other within the universe.
|
|
|
|
// Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces
|
|
|
|
// the number of future candidate paths given by that same function.
|
2023-03-07 14:42:58 +01:00
|
|
|
graph.visit_paths_of_cost(
|
2023-03-14 16:37:47 +01:00
|
|
|
graph.query_graph.root_node,
|
2023-03-02 21:27:42 +01:00
|
|
|
cost,
|
2023-03-07 14:42:58 +01:00
|
|
|
all_distances,
|
|
|
|
empty_paths_cache,
|
|
|
|
|path, graph, empty_paths_cache| {
|
2023-03-08 13:26:29 +01:00
|
|
|
// Accumulate the path for logging purposes only
|
2023-03-08 09:53:05 +01:00
|
|
|
paths.push(path.to_vec());
|
2023-03-07 14:42:58 +01:00
|
|
|
let mut path_docids = universe.clone();
|
2023-03-08 13:26:29 +01:00
|
|
|
|
|
|
|
// We store the edges and their docids in vectors in case the path turns out to be
|
|
|
|
// empty and we need to figure out why it was empty.
|
2023-03-14 16:37:47 +01:00
|
|
|
let mut visited_conditions = vec![];
|
2023-03-15 16:08:43 +01:00
|
|
|
let mut cached_edge_docids = vec![];
|
|
|
|
// graph.conditions_interner.map(|_| RoaringBitmap::new());
|
2023-03-08 13:26:29 +01:00
|
|
|
|
2023-03-14 16:37:47 +01:00
|
|
|
for &condition_interned_raw in path {
|
|
|
|
let condition = Interned::new(condition_interned_raw);
|
|
|
|
visited_conditions.push(condition_interned_raw);
|
2023-03-13 12:46:32 +01:00
|
|
|
|
|
|
|
let edge_docids =
|
|
|
|
edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?;
|
|
|
|
|
2023-03-15 16:08:43 +01:00
|
|
|
cached_edge_docids.push((condition, edge_docids.clone())); // .get_mut(condition) = edge_docids.clone();
|
2023-03-08 13:26:29 +01:00
|
|
|
|
|
|
|
// If the edge is empty, then the path will be empty as well, we update the graph
|
|
|
|
// and caches accordingly and skip to the next candidate path.
|
2023-03-07 14:42:58 +01:00
|
|
|
if edge_docids.is_disjoint(&universe) {
|
|
|
|
// 1. Store in the cache that this edge is empty for this universe
|
2023-03-14 16:37:47 +01:00
|
|
|
empty_paths_cache.add_condition(condition);
|
2023-03-07 14:42:58 +01:00
|
|
|
// 2. remove this edge from the ranking rule graph
|
2023-03-14 16:37:47 +01:00
|
|
|
// ouch, no! :( need to link a condition to one or more ranking rule edges
|
|
|
|
graph.remove_edges_with_condition(condition);
|
2023-03-08 13:26:29 +01:00
|
|
|
// 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
|
2023-03-13 12:46:32 +01:00
|
|
|
edge_docids_cache.cache.remove(&condition);
|
2023-03-15 12:52:40 +01:00
|
|
|
return Ok(ControlFlow::Continue(()));
|
2023-03-07 14:42:58 +01:00
|
|
|
}
|
|
|
|
path_docids &= edge_docids;
|
|
|
|
|
2023-03-08 13:26:29 +01:00
|
|
|
// If the (sub)path is empty, we try to figure out why and update the caches accordingly.
|
2023-03-07 14:42:58 +01:00
|
|
|
if path_docids.is_disjoint(&universe) {
|
2023-03-08 13:26:29 +01:00
|
|
|
// First, we know that this path is empty, and thus any path
|
|
|
|
// that is a superset of it will also be empty.
|
2023-03-14 16:37:47 +01:00
|
|
|
empty_paths_cache.add_prefix(&visited_conditions);
|
2023-03-08 13:26:29 +01:00
|
|
|
// Second, if the intersection between this edge and any
|
2023-03-07 14:42:58 +01:00
|
|
|
// previous one is disjoint with the universe,
|
2023-03-08 13:26:29 +01:00
|
|
|
// then we also know that any path containing the same couple of
|
|
|
|
// edges will also be empty.
|
2023-03-14 16:37:47 +01:00
|
|
|
for (past_condition, edge_docids2) in cached_edge_docids.iter() {
|
2023-03-15 16:08:43 +01:00
|
|
|
if *past_condition == condition {
|
2023-03-14 16:37:47 +01:00
|
|
|
continue;
|
|
|
|
};
|
2023-03-07 14:42:58 +01:00
|
|
|
let intersection = edge_docids & edge_docids2;
|
|
|
|
if intersection.is_disjoint(&universe) {
|
2023-03-15 16:08:43 +01:00
|
|
|
empty_paths_cache.add_condition_couple(*past_condition, condition);
|
2023-03-07 14:42:58 +01:00
|
|
|
}
|
|
|
|
}
|
2023-03-13 12:46:32 +01:00
|
|
|
// We should maybe instead try to compute:
|
|
|
|
// 0th & nth & 1st & n-1th & 2nd & etc...
|
2023-03-15 12:52:40 +01:00
|
|
|
return Ok(ControlFlow::Continue(()));
|
2023-03-07 14:42:58 +01:00
|
|
|
}
|
|
|
|
}
|
2023-03-15 16:08:43 +01:00
|
|
|
assert!(!path_docids.is_empty());
|
|
|
|
for condition in path {
|
|
|
|
used_conditions.insert(Interned::new(*condition));
|
|
|
|
}
|
2023-03-07 14:42:58 +01:00
|
|
|
bucket |= &path_docids;
|
2023-03-08 13:26:29 +01:00
|
|
|
// Reduce the size of the universe so that we can more optimistically discard candidate paths
|
2023-03-07 14:42:58 +01:00
|
|
|
universe -= path_docids;
|
2023-03-15 12:52:40 +01:00
|
|
|
|
|
|
|
if universe.is_empty() {
|
|
|
|
Ok(ControlFlow::Break(()))
|
|
|
|
} else {
|
|
|
|
Ok(ControlFlow::Continue(()))
|
|
|
|
}
|
2023-03-07 14:42:58 +01:00
|
|
|
},
|
|
|
|
)?;
|
2023-03-02 21:27:42 +01:00
|
|
|
|
2023-03-13 14:03:48 +01:00
|
|
|
G::log_state(
|
|
|
|
&original_graph,
|
|
|
|
&paths,
|
2023-03-15 16:08:43 +01:00
|
|
|
empty_paths_cache,
|
2023-03-13 14:03:48 +01:00
|
|
|
original_universe,
|
2023-03-15 16:08:43 +01:00
|
|
|
all_distances,
|
2023-03-13 14:03:48 +01:00
|
|
|
cost,
|
|
|
|
logger,
|
|
|
|
);
|
2023-02-23 13:13:19 +01:00
|
|
|
|
2023-03-15 16:08:43 +01:00
|
|
|
// We modify the next query graph so that it only contains the subgraph
|
|
|
|
// that was used to compute this bucket
|
|
|
|
// But we only do it in case the bucket length is >1, because otherwise
|
|
|
|
// we know the child ranking rule won't be called anyway
|
|
|
|
let mut next_query_graph = original_graph.query_graph;
|
|
|
|
next_query_graph.simplify();
|
|
|
|
if bucket.len() > 1 {
|
|
|
|
// 1. Gather all the words and phrases used in the computation of this bucket
|
|
|
|
let mut used_words = HashSet::new();
|
|
|
|
let mut used_phrases = HashSet::new();
|
|
|
|
for condition in used_conditions.iter() {
|
|
|
|
let condition = graph.conditions_interner.get(condition);
|
|
|
|
used_words.extend(G::words_used_by_edge_condition(ctx, condition)?);
|
|
|
|
used_phrases.extend(G::phrases_used_by_edge_condition(ctx, condition)?);
|
|
|
|
}
|
|
|
|
// 2. Remove the unused words and phrases from all the nodes in the graph
|
|
|
|
let mut nodes_to_remove = vec![];
|
|
|
|
for (node_id, node) in next_query_graph.nodes.iter_mut() {
|
|
|
|
let term = match &mut node.data {
|
|
|
|
QueryNodeData::Term(term) => term,
|
|
|
|
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
|
|
|
};
|
|
|
|
if let Some(new_term) = ctx
|
|
|
|
.term_interner
|
|
|
|
.get(term.value)
|
|
|
|
.removing_forbidden_terms(&used_words, &used_phrases)
|
|
|
|
{
|
|
|
|
if new_term.is_empty() {
|
|
|
|
nodes_to_remove.push(node_id);
|
|
|
|
} else {
|
|
|
|
term.value = ctx.term_interner.insert(new_term);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// 3. Remove the empty nodes from the graph
|
|
|
|
next_query_graph.remove_nodes(&nodes_to_remove);
|
|
|
|
}
|
2023-02-21 09:48:49 +01:00
|
|
|
|
|
|
|
self.state = Some(state);
|
|
|
|
|
|
|
|
Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket }))
|
|
|
|
}
|
|
|
|
|
|
|
|
fn end_iteration(
|
|
|
|
&mut self,
|
2023-03-13 14:03:48 +01:00
|
|
|
_ctx: &mut SearchContext<'ctx>,
|
2023-02-28 11:49:24 +01:00
|
|
|
_logger: &mut dyn SearchLogger<QueryGraph>,
|
2023-02-21 09:48:49 +01:00
|
|
|
) {
|
|
|
|
self.state = None;
|
|
|
|
}
|
|
|
|
}
|