diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index b1f57fd0e..2fa92900c 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -5,7 +5,7 @@ use fxhash::FxHashMap; use heed::types::ByteSlice; use heed::{BytesEncode, Database, RoTxn}; -use super::interner::{Interned, Interner}; +use super::interner::{DedupInterner, Interned}; use crate::{Index, Result}; /// A cache storing pointers to values in the LMDB databases. @@ -53,7 +53,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, word: Interned, ) -> Result> { Self::get_value( @@ -69,7 +69,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, prefix: Interned, ) -> Result> { Self::get_value( @@ -85,7 +85,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, word1: Interned, word2: Interned, proximity: u8, @@ -103,7 +103,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, word1: Interned, prefix2: Interned, proximity: u8, @@ -120,7 +120,7 @@ impl<'ctx> DatabaseCache<'ctx> { &mut self, index: &Index, txn: &'ctx RoTxn, - word_interner: &Interner, + word_interner: &DedupInterner, left_prefix: Interned, right: Interned, proximity: u8, diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 3281ffd2b..5a28ab58a 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -38,13 +38,16 @@ That is we find the documents where either: use roaring::RoaringBitmap; +use super::interner::MappedInterner; use super::logger::SearchLogger; +use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - EdgeCondition, EdgeConditionsCache, EmptyPathsCache, ProximityGraph, RankingRuleGraph, + DeadEndPathCache, EdgeCondition, EdgeConditionDocIdsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; +use crate::search::new::interner::Interned; use crate::Result; pub type Proximity = GraphBasedRankingRule; @@ -79,12 +82,12 @@ pub struct GraphBasedRankingRuleState { /// The current graph graph: RankingRuleGraph, /// Cache to retrieve the docids associated with each edge - edge_conditions_cache: EdgeConditionsCache, + edge_conditions_cache: EdgeConditionDocIdsCache, /// Cache used to optimistically discard paths that resolve to no documents. - empty_paths_cache: EmptyPathsCache, + empty_paths_cache: DeadEndPathCache, /// A structure giving the list of possible costs from each node to the end node, /// along with a set of unavoidable edges that must be traversed to achieve that distance. - all_distances: Vec>, + all_distances: MappedInterner)>, QueryNode>, /// An index in the first element of `all_distances`, giving the cost of the next bucket cur_distance_idx: usize, } @@ -95,12 +98,12 @@ pub struct GraphBasedRankingRuleState { fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( ctx: &mut SearchContext<'ctx>, graph: &mut RankingRuleGraph, - edge_docids_cache: &mut EdgeConditionsCache, + edge_docids_cache: &mut EdgeConditionDocIdsCache, universe: &RoaringBitmap, - empty_paths_cache: &mut EmptyPathsCache, + empty_paths_cache: &mut DeadEndPathCache, ) -> Result<()> { - for edge_index in 0..graph.edges_store.len() as u16 { - let Some(edge) = graph.edges_store[edge_index as usize].as_ref() else { + for edge_id in graph.edges_store.indexes() { + let Some(edge) = graph.edges_store.get(edge_id).as_ref() else { continue; }; let condition = edge.condition; @@ -110,8 +113,8 @@ fn remove_empty_edges<'ctx, G: RankingRuleGraphTrait>( EdgeCondition::Conditional(condition) => { let docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, universe)?; if docids.is_disjoint(universe) { - graph.remove_ranking_rule_edge(edge_index); - empty_paths_cache.forbid_edge(edge_index); + graph.remove_edges_with_condition(condition); + empty_paths_cache.add_condition(condition); edge_docids_cache.cache.remove(&condition); continue; } @@ -133,8 +136,8 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase query_graph: &QueryGraph, ) -> Result<()> { let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; - let mut edge_docids_cache = EdgeConditionsCache::default(); - let mut empty_paths_cache = EmptyPathsCache::new(graph.edges_store.len() as u16); + let mut edge_docids_cache = EdgeConditionDocIdsCache::default(); + let mut empty_paths_cache = DeadEndPathCache::new(&graph.conditions_interner); // First simplify the graph as much as possible, by computing the docids of the edges // within the rule's universe and removing the edges that have no associated docids. @@ -187,7 +190,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // If the cur_distance_idx does not point to a valid cost in the `all_distances` // structure, then we have computed all the buckets and can return. if state.cur_distance_idx - >= state.all_distances[state.graph.query_graph.root_node as usize].len() + >= state.all_distances.get(state.graph.query_graph.root_node).len() { self.state = None; return Ok(None); @@ -195,7 +198,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // Retrieve the cost of the paths to compute let (cost, _) = - state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; + state.all_distances.get(state.graph.query_graph.root_node)[state.cur_distance_idx]; state.cur_distance_idx += 1; let mut bucket = RoaringBitmap::new(); @@ -226,7 +229,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces // the number of future candidate paths given by that same function. graph.visit_paths_of_cost( - graph.query_graph.root_node as usize, + graph.query_graph.root_node, cost, all_distances, empty_paths_cache, @@ -237,29 +240,27 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // We store the edges and their docids in vectors in case the path turns out to be // empty and we need to figure out why it was empty. - let mut visited_edges = vec![]; - let mut cached_edge_docids = vec![]; + let mut visited_conditions = vec![]; + let mut cached_edge_docids = + graph.conditions_interner.map(|_| RoaringBitmap::new()); - for &edge_index in path { - visited_edges.push(edge_index); - let edge = graph.edges_store[edge_index as usize].as_ref().unwrap(); - let condition = match edge.condition { - EdgeCondition::Unconditional => continue, - EdgeCondition::Conditional(condition) => condition, - }; + for &condition_interned_raw in path { + let condition = Interned::new(condition_interned_raw); + visited_conditions.push(condition_interned_raw); let edge_docids = edge_docids_cache.get_edge_docids(ctx, condition, graph, &universe)?; - cached_edge_docids.push((edge_index, edge_docids.clone())); + *cached_edge_docids.get_mut(condition) = edge_docids.clone(); // If the edge is empty, then the path will be empty as well, we update the graph // and caches accordingly and skip to the next candidate path. if edge_docids.is_disjoint(&universe) { // 1. Store in the cache that this edge is empty for this universe - empty_paths_cache.forbid_edge(edge_index); + empty_paths_cache.add_condition(condition); // 2. remove this edge from the ranking rule graph - graph.remove_ranking_rule_edge(edge_index); + // ouch, no! :( need to link a condition to one or more ranking rule edges + graph.remove_edges_with_condition(condition); // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore edge_docids_cache.cache.remove(&condition); return Ok(()); @@ -270,17 +271,18 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase if path_docids.is_disjoint(&universe) { // First, we know that this path is empty, and thus any path // that is a superset of it will also be empty. - empty_paths_cache.forbid_prefix(&visited_edges); + empty_paths_cache.add_prefix(&visited_conditions); // Second, if the intersection between this edge and any // previous one is disjoint with the universe, // then we also know that any path containing the same couple of // edges will also be empty. - for (edge_index2, edge_docids2) in - cached_edge_docids[..cached_edge_docids.len() - 1].iter() - { + for (past_condition, edge_docids2) in cached_edge_docids.iter() { + if past_condition == condition { + continue; + }; let intersection = edge_docids & edge_docids2; if intersection.is_disjoint(&universe) { - empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); + empty_paths_cache.add_condition_couple(past_condition, condition); } } // We should maybe instead try to compute: @@ -291,6 +293,7 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase bucket |= &path_docids; // Reduce the size of the universe so that we can more optimistically discard candidate paths universe -= path_docids; + // TODO: if the universe is empty, stop iterating Ok(()) }, )?; diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index 55c343cd5..da8473e92 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::hash::Hash; use std::marker::PhantomData; @@ -5,14 +6,16 @@ use fxhash::FxHashMap; /// An index within a [`Interner`] structure. pub struct Interned { - idx: u32, + idx: u16, _phantom: PhantomData, } - impl Interned { - fn new(idx: u32) -> Self { + pub fn new(idx: u16) -> Self { Self { idx, _phantom: PhantomData } } + pub fn into_inner(self) -> u16 { + self.idx + } } // TODO: the stable store should be replaced by a bump allocator @@ -34,17 +37,22 @@ impl Interned { /// be copied, compared, and hashed efficiently. An immutable reference to the original value /// can be retrieved using `self.get(interned)`. #[derive(Clone)] -pub struct Interner { +pub struct DedupInterner { stable_store: Vec, lookup: FxHashMap>, } -impl Default for Interner { +impl Default for DedupInterner { fn default() -> Self { Self { stable_store: Default::default(), lookup: Default::default() } } } +impl DedupInterner { + pub fn freeze(self) -> FixedSizeInterner { + FixedSizeInterner { stable_store: self.stable_store } + } +} -impl Interner +impl DedupInterner where T: Clone + Eq + Hash, { @@ -52,8 +60,9 @@ where if let Some(interned) = self.lookup.get(&s) { *interned } else { + assert!(self.stable_store.len() < u16::MAX as usize); self.stable_store.push(s.clone()); - let interned = Interned::new(self.stable_store.len() as u32 - 1); + let interned = Interned::new(self.stable_store.len() as u16 - 1); self.lookup.insert(s, interned); interned } @@ -62,7 +71,93 @@ where &self.stable_store[interned.idx as usize] } } +#[derive(Clone)] +pub struct Interner { + stable_store: Vec, +} +impl Default for Interner { + fn default() -> Self { + Self { stable_store: Default::default() } + } +} +impl Interner { + pub fn freeze(self) -> FixedSizeInterner { + FixedSizeInterner { stable_store: self.stable_store } + } + pub fn push(&mut self, s: T) -> Interned { + assert!(self.stable_store.len() < u16::MAX as usize); + self.stable_store.push(s); + Interned::new(self.stable_store.len() as u16 - 1) + } +} +#[derive(Clone)] +pub struct FixedSizeInterner { + stable_store: Vec, +} +impl FixedSizeInterner { + pub fn new(length: u16, value: T) -> Self { + Self { stable_store: vec![value; length as usize] } + } +} + +impl FixedSizeInterner { + pub fn from_vec(store: Vec) -> Self { + Self { stable_store: store } + } + pub fn get(&self, interned: Interned) -> &T { + &self.stable_store[interned.idx as usize] + } + pub fn get_mut(&mut self, interned: Interned) -> &mut T { + &mut self.stable_store[interned.idx as usize] + } + + pub fn len(&self) -> u16 { + self.stable_store.len() as u16 + } + + pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { + MappedInterner { + stable_store: self.stable_store.iter().map(map_f).collect(), + _phantom: PhantomData, + } + } + pub fn indexes(&self) -> impl Iterator> { + (0..self.stable_store.len()).map(|i| Interned::new(i as u16)) + } + pub fn iter(&self) -> impl Iterator, &T)> { + self.stable_store.iter().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } + pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { + self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } +} +#[derive(Clone)] +pub struct MappedInterner { + stable_store: Vec, + _phantom: PhantomData, +} + +impl MappedInterner { + pub fn get(&self, interned: Interned) -> &T { + &self.stable_store[interned.idx as usize] + } + pub fn get_mut(&mut self, interned: Interned) -> &mut T { + &mut self.stable_store[interned.idx as usize] + } + pub fn map(&self, map_f: impl Fn(&T) -> U) -> MappedInterner { + MappedInterner { + stable_store: self.stable_store.iter().map(map_f).collect(), + _phantom: PhantomData, + } + } + pub fn iter(&self) -> impl Iterator, &T)> { + self.stable_store.iter().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } + pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { + self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::new(i as u16), x)) + } +} // Interned boilerplate implementations impl Hash for Interned { @@ -97,3 +192,14 @@ impl Clone for Interned { } impl Copy for Interned {} + +impl fmt::Display for Interned { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.idx, f) + } +} +impl fmt::Debug for Interned { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.idx, f) + } +} diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 575d5b0bf..fb5a296bd 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,10 +6,12 @@ use std::time::Instant; use rand::random; use roaring::RoaringBitmap; +use crate::search::new::interner::{Interned, MappedInterner}; +use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::ranking_rule_graph::{ - Edge, EdgeCondition, EmptyPathsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoGraph, + DeadEndPathCache, Edge, EdgeCondition, ProximityEdge, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoEdge, TypoGraph, }; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; @@ -42,17 +44,17 @@ pub enum SearchEvents { ProximityState { graph: RankingRuleGraph, paths: Vec>, - empty_paths_cache: EmptyPathsCache, + empty_paths_cache: DeadEndPathCache, universe: RoaringBitmap, - distances: Vec>, + distances: MappedInterner)>, QueryNode>, cost: u16, }, TypoState { graph: RankingRuleGraph, paths: Vec>, - empty_paths_cache: EmptyPathsCache, + empty_paths_cache: DeadEndPathCache, universe: RoaringBitmap, - distances: Vec>, + distances: MappedInterner)>, QueryNode>, cost: u16, }, RankingRuleSkipBucket { @@ -168,9 +170,9 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::ProximityState { @@ -178,7 +180,7 @@ impl SearchLogger for DetailedSearchLogger { paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), - distances, + distances: distances.clone(), cost, }) } @@ -187,9 +189,9 @@ impl SearchLogger for DetailedSearchLogger { &mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ) { self.events.push(SearchEvents::TypoState { @@ -197,7 +199,7 @@ impl SearchLogger for DetailedSearchLogger { paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), - distances, + distances: distances.clone(), cost, }) } @@ -424,15 +426,15 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc( + fn query_node_d2_desc( ctx: &mut SearchContext, - node_idx: usize, + node_idx: Interned, node: &QueryNode, - distances: &[(u16, SmallBitmap)], + distances: &[(u16, SmallBitmap)], file: &mut File, ) { - match &node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => { + match &node.data { + QueryNodeData::Term(LocatedQueryTerm { value, .. }) => { let QueryTerm { original, zero_typo, @@ -496,11 +498,11 @@ shape: class" writeln!(file, "}}").unwrap(); } - QueryNode::Deleted => panic!(), - QueryNode::Start => { + QueryNodeData::Deleted => panic!(), + QueryNodeData::Start => { writeln!(file, "{node_idx} : START").unwrap(); } - QueryNode::End => { + QueryNodeData::End => { writeln!(file, "{node_idx} : END").unwrap(); } } @@ -511,14 +513,14 @@ shape: class" file: &mut File, ) { writeln!(file, "direction: right").unwrap(); - for node in 0..query_graph.nodes.len() { - if matches!(query_graph.nodes[node], QueryNode::Deleted) { + for (node_id, node) in query_graph.nodes.iter() { + if matches!(node.data, QueryNodeData::Deleted) { continue; } - Self::query_node_d2_desc(ctx, node, &query_graph.nodes[node], &[], file); + Self::query_node_d2_desc::(ctx, node_id, node, &[], file); - for edge in query_graph.edges[node].successors.iter() { - writeln!(file, "{node} -> {edge};\n").unwrap(); + for edge in node.successors.iter() { + writeln!(file, "{node_id} -> {edge};\n").unwrap(); } } } @@ -526,31 +528,28 @@ shape: class" ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], - _empty_paths_cache: &EmptyPathsCache, - distances: Vec>, + _empty_paths_cache: &DeadEndPathCache, + distances: MappedInterner)>, QueryNode>, file: &mut File, ) { writeln!(file, "direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); - for (node_idx, node) in graph.query_graph.nodes.iter().enumerate() { - if matches!(node, QueryNode::Deleted) { + for (node_idx, node) in graph.query_graph.nodes.iter() { + if matches!(&node.data, QueryNodeData::Deleted) { continue; } - let distances = &distances[node_idx]; - Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file); + let distances = &distances.get(node_idx); + Self::query_node_d2_desc::(ctx, node_idx, node, distances, file); } - for edge in graph.edges_store.iter().flatten() { - let Edge { source_node, dest_node, condition: details, .. } = edge; + for (_edge_id, edge) in graph.edges_store.iter() { + let Some(edge) = edge else { continue }; + let Edge { source_node, dest_node, condition: details, cost } = edge; match &details { EdgeCondition::Unconditional => { - writeln!( - file, - "{source_node} -> {dest_node} : \"always cost {cost}\"", - cost = edge.cost, - ) - .unwrap(); + writeln!(file, "{source_node} -> {dest_node} : \"always cost {cost}\"",) + .unwrap(); } EdgeCondition::Conditional(condition) => { let condition = graph.conditions_interner.get(*condition); @@ -590,39 +589,19 @@ shape: class" // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description( - ctx: &mut SearchContext, + fn condition_d2_description( + _ctx: &mut SearchContext, graph: &RankingRuleGraph, - edge_idx: u16, + condition_id: Interned, file: &mut File, ) { - let Edge { source_node, dest_node, cost, .. } = - graph.edges_store[edge_idx as usize].as_ref().unwrap(); - let source_node = &graph.query_graph.nodes[*source_node as usize]; - let source_node_desc = match source_node { - QueryNode::Term(term) => { - let term = ctx.term_interner.get(term.value); - ctx.word_interner.get(term.original).to_owned() - } - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; - let dest_node = &graph.query_graph.nodes[*dest_node as usize]; - let dest_node_desc = match dest_node { - QueryNode::Term(term) => { - let term = ctx.term_interner.get(term.value); - ctx.word_interner.get(term.original).to_owned() - } - QueryNode::Deleted => panic!(), - QueryNode::Start => "START".to_owned(), - QueryNode::End => "END".to_owned(), - }; + let condition = graph.conditions_interner.get(condition_id); writeln!( file, - "{edge_idx}: \"{source_node_desc}->{dest_node_desc} [{cost}]\" {{ + "{condition_id}: \"{}\" {{ shape: class - }}" + }}", + R::label_for_edge_condition(condition) ) .unwrap(); } @@ -632,12 +611,12 @@ shape: class" paths: &[Vec], file: &mut File, ) { - for (path_idx, edge_indexes) in paths.iter().enumerate() { + for (path_idx, condition_indexes) in paths.iter().enumerate() { writeln!(file, "{path_idx} {{").unwrap(); - for edge_idx in edge_indexes.iter() { - Self::edge_d2_description(ctx, graph, *edge_idx, file); + for condition in condition_indexes.iter() { + Self::condition_d2_description(ctx, graph, Interned::new(*condition), file); } - for couple_edges in edge_indexes.windows(2) { + for couple_edges in condition_indexes.windows(2) { let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() }; writeln!(file, "{src_edge_idx} -> {dest_edge_idx}").unwrap(); } diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index c5f3e5351..ff500d4b8 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -3,7 +3,11 @@ pub mod detailed; use roaring::RoaringBitmap; -use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}; +use super::interner::MappedInterner; +use super::query_graph::QueryNode; +use super::ranking_rule_graph::{ + DeadEndPathCache, ProximityEdge, ProximityGraph, RankingRuleGraph, TypoEdge, TypoGraph, +}; use super::small_bitmap::SmallBitmap; use super::{RankingRule, RankingRuleQueryTrait}; @@ -62,9 +66,9 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ); @@ -73,9 +77,9 @@ pub trait SearchLogger { &mut self, query_graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: Vec>, + distances: &MappedInterner)>, QueryNode>, cost: u16, ); } @@ -133,9 +137,9 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths_map: &[Vec], - _empty_paths_cache: &EmptyPathsCache, + _empty_paths_cache: &DeadEndPathCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: &MappedInterner)>, QueryNode>, _cost: u16, ) { } @@ -144,9 +148,9 @@ impl SearchLogger for DefaultSearchLogger { &mut self, _query_graph: &RankingRuleGraph, _paths: &[Vec], - _empty_paths_cache: &EmptyPathsCache, + _empty_paths_cache: &DeadEndPathCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: &MappedInterner)>, QueryNode>, _cost: u16, ) { } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 1eaa6d347..11420545c 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -26,7 +26,8 @@ use query_graph::{QueryGraph, QueryNode}; pub use ranking_rules::{bucket_sort, RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use roaring::RoaringBitmap; -use self::interner::Interner; +use self::interner::DedupInterner; +use self::query_graph::QueryNodeData; use self::query_term::{Phrase, QueryTerm}; use self::ranking_rules::PlaceholderQuery; use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; @@ -39,9 +40,9 @@ pub struct SearchContext<'ctx> { pub index: &'ctx Index, pub txn: &'ctx RoTxn<'ctx>, pub db_cache: DatabaseCache<'ctx>, - pub word_interner: Interner, - pub phrase_interner: Interner, - pub term_interner: Interner, + pub word_interner: DedupInterner, + pub phrase_interner: DedupInterner, + pub term_interner: DedupInterner, pub term_docids: QueryTermDocIdsCache, } impl<'ctx> SearchContext<'ctx> { @@ -70,12 +71,12 @@ fn resolve_maximally_reduced_query_graph<'ctx>( let mut positions_to_remove = match matching_strategy { TermsMatchingStrategy::Last => { let mut all_positions = BTreeSet::new(); - for n in query_graph.nodes.iter() { - match n { - QueryNode::Term(term) => { + for (_, n) in query_graph.nodes.iter() { + match &n.data { + QueryNodeData::Term(term) => { all_positions.extend(term.positions.clone().into_iter()); } - QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } } all_positions.into_iter().collect() @@ -200,7 +201,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( continue; } asc.insert(field); - todo!(); + // TODO } crate::Criterion::Desc(field) => { if desc.contains(&field) { @@ -295,45 +296,48 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - // loop { - let start = Instant::now(); + loop { + let start = Instant::now(); - let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "sun flower s are the best", - TermsMatchingStrategy::Last, - None, - 0, - 20, - &mut DefaultSearchLogger, - &mut logger, - ) - .unwrap(); + // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + // "which a the releases from poison by the government", + // "sun flower s are the best", + "zero config", + TermsMatchingStrategy::Last, + None, + 0, + 20, + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + //&mut logger, + ) + .unwrap(); - logger.write_d2_description(&mut ctx); + // logger.write_d2_description(&mut ctx); - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let _documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - println!("{}us: {:?}", elapsed.as_micros(), results); - // } + println!("{}us: {:?}", elapsed.as_micros(), results); + } // for (id, _document) in documents { // println!("{id}:"); // // println!("{document}"); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 7bed15571..d487a644f 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,10 +1,11 @@ +use std::collections::HashSet; + +use super::interner::{FixedSizeInterner, Interned}; use super::query_term::{self, number_of_typos_allowed, LocatedQueryTerm}; use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::Result; -pub const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64; - /// A node of the [`QueryGraph`]. /// /// There are four types of nodes: @@ -15,22 +16,19 @@ pub const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64; /// 4. `Term` is a regular node representing a word or combination of words /// from the user query. #[derive(Clone)] -pub enum QueryNode { +pub struct QueryNode { + pub data: QueryNodeData, + pub predecessors: SmallBitmap, + pub successors: SmallBitmap, +} +#[derive(Clone)] +pub enum QueryNodeData { Term(LocatedQueryTerm), Deleted, Start, End, } -/// The edges associated with a node in the query graph. -#[derive(Clone)] -pub struct Edges { - /// Set of nodes which have an edge going to the current node - pub predecessors: SmallBitmap, - /// Set of nodes which are reached by an edge from the current node - pub successors: SmallBitmap, -} - /** A graph representing all the ways to interpret the user's search query. @@ -78,55 +76,45 @@ and the transformations that were done on the query graph). #[derive(Clone)] pub struct QueryGraph { /// The index of the start node within `self.nodes` - pub root_node: u16, + pub root_node: Interned, /// The index of the end node within `self.nodes` - pub end_node: u16, + pub end_node: Interned, /// The list of all query nodes - pub nodes: Vec, - /// The list of all node edges - pub edges: Vec, + pub nodes: FixedSizeInterner, } -impl Default for QueryGraph { - /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. - fn default() -> Self { - let nodes = vec![QueryNode::Start, QueryNode::End]; - let edges = vec![ - Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }, - Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }, - ]; +// impl Default for QueryGraph { +// /// Create a new QueryGraph with two disconnected nodes: the root and end nodes. +// fn default() -> Self { +// let nodes = vec![ +// QueryNode { +// data: QueryNodeData::Start, +// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// }, +// QueryNode { +// data: QueryNodeData::End, +// predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), +// }, +// ]; - Self { root_node: 0, end_node: 1, nodes, edges } - } -} +// Self { root_node: 0, end_node: 1, nodes } +// } +// } impl QueryGraph { /// Connect all the given predecessor nodes to the given successor node - fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) { + fn connect_to_node( + &mut self, + from_nodes: &[Interned], + to_node: Interned, + ) { for &from_node in from_nodes { - self.edges[from_node as usize].successors.insert(to_node); - self.edges[to_node as usize].predecessors.insert(from_node); + self.nodes.get_mut(from_node).successors.insert(to_node); + self.nodes.get_mut(to_node).predecessors.insert(from_node); } } - /// Add the given node to the graph and connect it to all the given predecessor nodes - fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 { - let new_node_idx = self.nodes.len() as u16; - assert!(new_node_idx <= QUERY_GRAPH_NODE_LENGTH_LIMIT); - self.nodes.push(node); - self.edges.push(Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }); - self.connect_to_node(from_nodes, new_node_idx); - - new_node_idx - } } impl QueryGraph { @@ -136,17 +124,27 @@ impl QueryGraph { let mut empty_nodes = vec![]; - let mut graph = QueryGraph::default(); + let mut predecessors: Vec> = vec![HashSet::new(), HashSet::new()]; + let mut successors: Vec> = vec![HashSet::new(), HashSet::new()]; + let mut nodes_data: Vec = vec![QueryNodeData::Start, QueryNodeData::End]; + let root_node = 0; + let end_node = 1; // TODO: we could consider generalizing to 4,5,6,7,etc. ngrams let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = - (vec![], vec![], vec![graph.root_node]); + (vec![], vec![], vec![root_node]); for term_idx in 0..terms.len() { let term0 = &terms[term_idx]; let mut new_nodes = vec![]; - let new_node_idx = graph.add_node(&prev0, QueryNode::Term(term0.clone())); + let new_node_idx = add_node( + &mut nodes_data, + QueryNodeData::Term(term0.clone()), + &prev0, + &mut successors, + &mut predecessors, + ); new_nodes.push(new_node_idx); if term0.is_empty(&ctx.term_interner) { empty_nodes.push(new_node_idx); @@ -156,7 +154,13 @@ impl QueryGraph { if let Some(ngram) = query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)? { - let ngram_idx = graph.add_node(&prev1, QueryNode::Term(ngram)); + let ngram_idx = add_node( + &mut nodes_data, + QueryNodeData::Term(ngram), + &prev1, + &mut successors, + &mut predecessors, + ); new_nodes.push(ngram_idx); } } @@ -164,53 +168,96 @@ impl QueryGraph { if let Some(ngram) = query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)? { - let ngram_idx = graph.add_node(&prev2, QueryNode::Term(ngram)); + let ngram_idx = add_node( + &mut nodes_data, + QueryNodeData::Term(ngram), + &prev2, + &mut successors, + &mut predecessors, + ); new_nodes.push(ngram_idx); } } (prev0, prev1, prev2) = (new_nodes, prev0, prev1); } - graph.connect_to_node(&prev0, graph.end_node); + let root_node = Interned::new(root_node); + let end_node = Interned::new(end_node); + let mut nodes = FixedSizeInterner::new( + nodes_data.len() as u16, + QueryNode { + data: QueryNodeData::Deleted, + predecessors: SmallBitmap::new(nodes_data.len() as u16), + successors: SmallBitmap::new(nodes_data.len() as u16), + }, + ); + for (node_idx, ((node_data, predecessors), successors)) in nodes_data + .into_iter() + .zip(predecessors.into_iter()) + .zip(successors.into_iter()) + .enumerate() + { + let node = nodes.get_mut(Interned::new(node_idx as u16)); + node.data = node_data; + for x in predecessors { + node.predecessors.insert(Interned::new(x)); + } + for x in successors { + node.successors.insert(Interned::new(x)); + } + } + let mut graph = QueryGraph { root_node, end_node, nodes }; + + graph.connect_to_node( + prev0.into_iter().map(Interned::new).collect::>().as_slice(), + end_node, + ); + let empty_nodes = empty_nodes.into_iter().map(Interned::new).collect::>(); graph.remove_nodes_keep_edges(&empty_nodes); Ok(graph) } /// Remove the given nodes and all their edges from the query graph. - pub fn remove_nodes(&mut self, nodes: &[u16]) { - for &node in nodes { - self.nodes[node as usize] = QueryNode::Deleted; - let edges = self.edges[node as usize].clone(); - for pred in edges.predecessors.iter() { - self.edges[pred as usize].successors.remove(node); + pub fn remove_nodes(&mut self, nodes: &[Interned]) { + for &node_id in nodes { + let node = &self.nodes.get(node_id); + let old_node_pred = node.predecessors.clone(); + let old_node_succ = node.successors.clone(); + + for pred in old_node_pred.iter() { + self.nodes.get_mut(pred).successors.remove(node_id); } - for succ in edges.successors.iter() { - self.edges[succ as usize].predecessors.remove(node); + for succ in old_node_succ.iter() { + self.nodes.get_mut(succ).predecessors.remove(node_id); } - self.edges[node as usize] = Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }; + + let node = self.nodes.get_mut(node_id); + node.data = QueryNodeData::Deleted; + node.predecessors.clear(); + node.successors.clear(); } } /// Remove the given nodes, connecting all their predecessors to all their successors. - pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) { - for &node in nodes { - self.nodes[node as usize] = QueryNode::Deleted; - let edges = self.edges[node as usize].clone(); - for pred in edges.predecessors.iter() { - self.edges[pred as usize].successors.remove(node); - self.edges[pred as usize].successors.union(&edges.successors); + pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned]) { + for &node_id in nodes { + let node = self.nodes.get(node_id); + let old_node_pred = node.predecessors.clone(); + let old_node_succ = node.successors.clone(); + for pred in old_node_pred.iter() { + let pred_successors = &mut self.nodes.get_mut(pred).successors; + pred_successors.remove(node_id); + pred_successors.union(&old_node_succ); } - for succ in edges.successors.iter() { - self.edges[succ as usize].predecessors.remove(node); - self.edges[succ as usize].predecessors.union(&edges.predecessors); + for succ in old_node_succ.iter() { + let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors; + succ_predecessors.remove(node_id); + succ_predecessors.union(&old_node_pred); } - self.edges[node as usize] = Edges { - predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT), - }; + let node = self.nodes.get_mut(node_id); + node.data = QueryNodeData::Deleted; + node.predecessors.clear(); + node.successors.clear(); } } @@ -219,9 +266,8 @@ impl QueryGraph { /// Return `true` if any node was removed. pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool { let mut nodes_to_remove_keeping_edges = vec![]; - for (node_idx, node) in self.nodes.iter().enumerate() { - let node_idx = node_idx as u16; - let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue }; + for (node_idx, node) in self.nodes.iter() { + let QueryNodeData::Term(LocatedQueryTerm { value: _, positions }) = &node.data else { continue }; if positions.start() == &position { nodes_to_remove_keeping_edges.push(node_idx); } @@ -238,13 +284,13 @@ impl QueryGraph { fn simplify(&mut self) { loop { let mut nodes_to_remove = vec![]; - for (node_idx, node) in self.nodes.iter().enumerate() { - if (!matches!(node, QueryNode::End | QueryNode::Deleted) - && self.edges[node_idx].successors.is_empty()) - || (!matches!(node, QueryNode::Start | QueryNode::Deleted) - && self.edges[node_idx].predecessors.is_empty()) + for (node_idx, node) in self.nodes.iter() { + if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted) + && node.successors.is_empty()) + || (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted) + && node.predecessors.is_empty()) { - nodes_to_remove.push(node_idx as u16); + nodes_to_remove.push(node_idx); } } if nodes_to_remove.is_empty() { @@ -255,3 +301,21 @@ impl QueryGraph { } } } + +fn add_node( + nodes_data: &mut Vec, + node_data: QueryNodeData, + from_nodes: &Vec, + successors: &mut Vec>, + predecessors: &mut Vec>, +) -> u16 { + successors.push(HashSet::new()); + predecessors.push(HashSet::new()); + let new_node_idx = nodes_data.len() as u16; + nodes_data.push(node_data); + for &from_node in from_nodes { + successors[from_node as usize].insert(new_node_idx); + predecessors[new_node_idx as usize].insert(from_node); + } + new_node_idx +} diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index c6cb81131..3272464f8 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -9,7 +9,7 @@ use heed::types::DecodeIgnore; use heed::RoTxn; use itertools::Itertools; -use super::interner::{Interned, Interner}; +use super::interner::{DedupInterner, Interned}; use super::SearchContext; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; @@ -22,7 +22,7 @@ pub struct Phrase { pub words: Vec>>, } impl Phrase { - pub fn description(&self, interner: &Interner) -> String { + pub fn description(&self, interner: &DedupInterner) -> String { self.words.iter().flatten().map(|w| interner.get(*w)).join(" ") } } @@ -60,8 +60,8 @@ pub struct QueryTerm { } impl QueryTerm { pub fn phrase( - word_interner: &mut Interner, - phrase_interner: &mut Interner, + word_interner: &mut DedupInterner, + phrase_interner: &mut DedupInterner, phrase: Phrase, ) -> Self { Self { @@ -78,7 +78,7 @@ impl QueryTerm { is_ngram: false, } } - pub fn empty(word_interner: &mut Interner, original: &str) -> Self { + pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { Self { original: word_interner.insert(original.to_owned()), phrase: None, @@ -313,7 +313,7 @@ pub struct LocatedQueryTerm { impl LocatedQueryTerm { /// Return `true` iff the term is empty - pub fn is_empty(&self, interner: &Interner) -> bool { + pub fn is_empty(&self, interner: &DedupInterner) -> bool { interner.get(self.value).is_empty() } } diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 286a98ab1..7ab08aceb 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::Interner; +use crate::search::new::interner::{DedupInterner, Interner}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, SearchContext}; use crate::Result; @@ -15,40 +15,43 @@ impl RankingRuleGraph { /// Build the ranking rule graph from the given query graph pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result { - let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. } = &query_graph; + let QueryGraph { nodes: graph_nodes, .. } = &query_graph; - let mut conditions_interner = Interner::default(); + let mut conditions_interner = DedupInterner::default(); - let mut edges_store = vec![]; - let mut edges_of_node = vec![]; + let mut edges_store = Interner::default(); + let mut edges_of_node = query_graph.nodes.map(|_| HashSet::new()); - for (source_idx, source_node) in graph_nodes.iter().enumerate() { - edges_of_node.push(HashSet::new()); - let new_edges = edges_of_node.last_mut().unwrap(); + for (source_id, source_node) in graph_nodes.iter() { + let new_edges = edges_of_node.get_mut(source_id); - for dest_idx in graph_edges[source_idx].successors.iter() { - let dest_node = &graph_nodes[dest_idx as usize]; + for dest_idx in source_node.successors.iter() { + let dest_node = graph_nodes.get(dest_idx); let edges = G::build_edges(ctx, &mut conditions_interner, source_node, dest_node)?; if edges.is_empty() { continue; } for (cost, condition) in edges { - edges_store.push(Some(Edge { - source_node: source_idx as u16, + let new_edge_id = edges_store.push(Some(Edge { + source_node: source_id, dest_node: dest_idx, cost, condition, })); - new_edges.insert(edges_store.len() as u16 - 1); + new_edges.insert(new_edge_id); } } } - let edges_of_node = edges_of_node - .into_iter() - .map(|edges| SmallBitmap::from_iter(edges.into_iter(), edges_store.len() as u16)) - .collect(); + let edges_store = edges_store.freeze(); + let edges_of_node = + edges_of_node.map(|edges| SmallBitmap::from_iter(edges.iter().copied(), &edges_store)); - Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner }) + Ok(RankingRuleGraph { + query_graph, + edges_store, + edges_of_node, + conditions_interner: conditions_interner.freeze(), + }) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 529bb32c4..597aff661 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -3,8 +3,10 @@ use std::collections::btree_map::Entry; use std::collections::{BTreeMap, VecDeque}; -use super::empty_paths_cache::EmptyPathsCache; -use super::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::empty_paths_cache::DeadEndPathCache; +use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::{Interned, MappedInterner}; +use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; use crate::Result; @@ -17,11 +19,11 @@ pub struct Path { impl RankingRuleGraph { pub fn visit_paths_of_cost( &mut self, - from: usize, + from: Interned, cost: u16, - all_distances: &[Vec<(u16, SmallBitmap)>], - empty_paths_cache: &mut EmptyPathsCache, - mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, + all_distances: &MappedInterner)>, QueryNode>, + empty_paths_cache: &mut DeadEndPathCache, + mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result<()>, ) -> Result<()> { let _ = self.visit_paths_of_cost_rec( from, @@ -30,76 +32,108 @@ impl RankingRuleGraph { empty_paths_cache, &mut visit, &mut vec![], - &mut SmallBitmap::new(self.edges_store.len() as u16), - empty_paths_cache.empty_edges.clone(), + &mut SmallBitmap::new(self.edges_store.len()), + &mut empty_paths_cache.conditions.clone(), )?; Ok(()) } pub fn visit_paths_of_cost_rec( &mut self, - from: usize, + from: Interned, cost: u16, - all_distances: &[Vec<(u16, SmallBitmap)>], - empty_paths_cache: &mut EmptyPathsCache, - visit: &mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, - prev_edges: &mut Vec, - cur_path: &mut SmallBitmap, - mut forbidden_edges: SmallBitmap, + all_distances: &MappedInterner)>, QueryNode>, + empty_paths_cache: &mut DeadEndPathCache, + visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache) -> Result<()>, + prev_conditions: &mut Vec, + cur_path: &mut SmallBitmap, + forbidden_conditions: &mut SmallBitmap, ) -> Result { let mut any_valid = false; - let edges = self.edges_of_node[from].clone(); + let edges = self.edges_of_node.get(from).clone(); for edge_idx in edges.iter() { - let Some(edge) = self.edges_store[edge_idx as usize].as_ref() else { continue }; - if cost < edge.cost as u16 - || forbidden_edges.contains(edge_idx) - || !all_distances[edge.dest_node as usize].iter().any( - |(next_cost, necessary_edges)| { - (*next_cost == cost - edge.cost as u16) - && !forbidden_edges.intersects(necessary_edges) - }, - ) - { + let Some(edge) = self.edges_store.get(edge_idx).as_ref() else { continue }; + if cost < edge.cost as u16 { continue; } - cur_path.insert(edge_idx); - prev_edges.push(edge_idx); + let next_any_valid = match edge.condition { + EdgeCondition::Unconditional => { + if edge.dest_node == self.query_graph.end_node { + any_valid = true; + visit(prev_conditions, self, empty_paths_cache)?; + true + } else { + self.visit_paths_of_cost_rec( + edge.dest_node, + cost - edge.cost as u16, + all_distances, + empty_paths_cache, + visit, + prev_conditions, + cur_path, + forbidden_conditions, + )? + } + } + EdgeCondition::Conditional(condition) => { + if forbidden_conditions.contains(condition) + || !all_distances.get(edge.dest_node).iter().any( + |(next_cost, necessary_conditions)| { + (*next_cost == cost - edge.cost as u16) + && !forbidden_conditions.intersects(necessary_conditions) + }, + ) + { + continue; + } + cur_path.insert(condition); + // TODO: typed path set + prev_conditions.push(condition.into_inner()); - let mut new_forbidden_edges = forbidden_edges.clone(); - new_forbidden_edges.union(&empty_paths_cache.empty_couple_edges[edge_idx as usize]); - empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { - new_forbidden_edges.insert(x); - }); - - let next_any_valid = if edge.dest_node == self.query_graph.end_node { - any_valid = true; - visit(prev_edges, self, empty_paths_cache)?; - true - } else { - self.visit_paths_of_cost_rec( - edge.dest_node as usize, - cost - edge.cost as u16, - all_distances, - empty_paths_cache, - visit, - prev_edges, - cur_path, - new_forbidden_edges, - )? + let mut new_forbidden_conditions = forbidden_conditions.clone(); + new_forbidden_conditions + .union(empty_paths_cache.condition_couples.get(condition)); + empty_paths_cache.prefixes.final_edges_after_prefix( + prev_conditions, + &mut |x| { + new_forbidden_conditions.insert(Interned::new(x)); + }, + ); + let next_any_valid = if edge.dest_node == self.query_graph.end_node { + any_valid = true; + visit(prev_conditions, self, empty_paths_cache)?; + true + } else { + self.visit_paths_of_cost_rec( + edge.dest_node, + cost - edge.cost as u16, + all_distances, + empty_paths_cache, + visit, + prev_conditions, + cur_path, + &mut new_forbidden_conditions, + )? + }; + cur_path.remove(condition); + prev_conditions.pop(); + next_any_valid + } }; any_valid |= next_any_valid; - cur_path.remove(edge_idx); - prev_edges.pop(); + if next_any_valid { - if empty_paths_cache.path_is_empty(prev_edges, cur_path) { + if empty_paths_cache.path_is_dead_end(prev_conditions, cur_path) { return Ok(any_valid); } - forbidden_edges.union(&empty_paths_cache.empty_edges); - for edge in prev_edges.iter() { - forbidden_edges.union(&empty_paths_cache.empty_couple_edges[*edge as usize]); + forbidden_conditions.union(&empty_paths_cache.conditions); + for prev_condition in prev_conditions.iter() { + forbidden_conditions.union( + empty_paths_cache.condition_couples.get(Interned::new(*prev_condition)), + ); } - empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| { - forbidden_edges.insert(x); + empty_paths_cache.prefixes.final_edges_after_prefix(prev_conditions, &mut |x| { + forbidden_conditions.insert(Interned::new(x)); }); } } @@ -107,36 +141,41 @@ impl RankingRuleGraph { Ok(any_valid) } - pub fn initialize_distances_with_necessary_edges(&self) -> Vec> { - let mut distances_to_end: Vec> = - vec![vec![]; self.query_graph.nodes.len()]; - let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16); + pub fn initialize_distances_with_necessary_edges( + &self, + ) -> MappedInterner)>, QueryNode> { + let mut distances_to_end = self.query_graph.nodes.map(|_| vec![]); + let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); let mut node_stack = VecDeque::new(); - distances_to_end[self.query_graph.end_node as usize] = - vec![(0, SmallBitmap::new(self.edges_store.len() as u16))]; + *distances_to_end.get_mut(self.query_graph.end_node) = + vec![(0, SmallBitmap::for_interned_values_in(&self.conditions_interner))]; - for prev_node in - self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() - { - node_stack.push_back(prev_node as usize); + for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() { + node_stack.push_back(prev_node); enqueued.insert(prev_node); } while let Some(cur_node) = node_stack.pop_front() { - let mut self_distances = BTreeMap::::new(); + let mut self_distances = BTreeMap::>::new(); - let cur_node_edges = &self.edges_of_node[cur_node]; + let cur_node_edges = &self.edges_of_node.get(cur_node); for edge_idx in cur_node_edges.iter() { - let edge = self.edges_store[edge_idx as usize].as_ref().unwrap(); + let edge = self.edges_store.get(edge_idx).as_ref().unwrap(); + let condition = match edge.condition { + EdgeCondition::Unconditional => None, + EdgeCondition::Conditional(condition) => Some(condition), + }; let succ_node = edge.dest_node; - let succ_distances = &distances_to_end[succ_node as usize]; - for (succ_distance, succ_necessary_edges) in succ_distances { - let potential_necessary_edges = SmallBitmap::from_iter( - std::iter::once(edge_idx).chain(succ_necessary_edges.iter()), - self.edges_store.len() as u16, - ); + let succ_distances = distances_to_end.get(succ_node); + for (succ_distance, succ_necessary_conditions) in succ_distances { + let mut potential_necessary_edges = + SmallBitmap::for_interned_values_in(&self.conditions_interner); + for condition in condition.into_iter().chain(succ_necessary_conditions.iter()) { + potential_necessary_edges.insert(condition); + } + match self_distances.entry(edge.cost as u16 + succ_distance) { Entry::Occupied(mut prev_necessary_edges) => { prev_necessary_edges.get_mut().intersection(&potential_necessary_edges); @@ -147,10 +186,14 @@ impl RankingRuleGraph { } } } - distances_to_end[cur_node] = self_distances.into_iter().collect(); - for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { + let distances_to_end_cur_node = distances_to_end.get_mut(cur_node); + for (cost, necessary_edges) in self_distances.iter() { + distances_to_end_cur_node.push((*cost, necessary_edges.clone())); + } + *distances_to_end.get_mut(cur_node) = self_distances.into_iter().collect(); + for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { if !enqueued.contains(prev_node) { - node_stack.push_back(prev_node as usize); + node_stack.push_back(prev_node); enqueued.insert(prev_node); } } diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 5da3de326..b3426619b 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -9,17 +9,17 @@ use crate::search::new::SearchContext; use crate::Result; /// A cache storing the document ids associated with each ranking rule edge -pub struct EdgeConditionsCache { +pub struct EdgeConditionDocIdsCache { // TODO: should be FxHashMap, RoaringBitmap> pub cache: FxHashMap, RoaringBitmap>, _phantom: PhantomData, } -impl Default for EdgeConditionsCache { +impl Default for EdgeConditionDocIdsCache { fn default() -> Self { Self { cache: Default::default(), _phantom: Default::default() } } } -impl EdgeConditionsCache { +impl EdgeConditionDocIdsCache { /// Retrieve the document ids for the given edge condition. /// /// If the cache does not yet contain these docids, they are computed diff --git a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs index 3178cfe27..3b518bc9b 100644 --- a/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/empty_paths_cache.rs @@ -1,59 +1,82 @@ -use super::path_set::PathSet; -use crate::search::new::small_bitmap::SmallBitmap; +use super::{path_set::PathSet, RankingRuleGraphTrait}; +use crate::search::new::{ + interner::{FixedSizeInterner, Interned, MappedInterner}, + small_bitmap::SmallBitmap, +}; /// A cache which stores sufficient conditions for a path /// to resolve to an empty set of candidates within the current /// universe. -#[derive(Clone)] -pub struct EmptyPathsCache { - /// The set of edge indexes that resolve to no documents. - pub empty_edges: SmallBitmap, +pub struct DeadEndPathCache { + /// The set of edge conditions that resolve to no documents. + pub conditions: SmallBitmap, /// A set of path prefixes that resolve to no documents. - pub empty_prefixes: PathSet, - /// A set of empty couples of edge indexes that resolve to no documents. - pub empty_couple_edges: Vec, + pub prefixes: PathSet, + /// A set of empty couples of edge conditions that resolve to no documents. + pub condition_couples: MappedInterner, G::EdgeCondition>, } -impl EmptyPathsCache { - /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. - pub fn new(all_edges_len: u16) -> Self { +impl Clone for DeadEndPathCache { + fn clone(&self) -> Self { Self { - empty_edges: SmallBitmap::new(all_edges_len), - empty_prefixes: PathSet::default(), - empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize], + conditions: self.conditions.clone(), + prefixes: self.prefixes.clone(), + condition_couples: self.condition_couples.clone(), + } + } +} + +impl DeadEndPathCache { + /// Create a new cache for a ranking rule graph containing at most `all_edges_len` edges. + pub fn new(all_edge_conditions: &FixedSizeInterner) -> Self { + Self { + conditions: SmallBitmap::for_interned_values_in(all_edge_conditions), + prefixes: PathSet::default(), + condition_couples: all_edge_conditions + .map(|_| SmallBitmap::for_interned_values_in(all_edge_conditions)), } } /// Store in the cache that every path containing the given edge resolves to no documents. - pub fn forbid_edge(&mut self, edge_idx: u16) { - self.empty_edges.insert(edge_idx); - self.empty_couple_edges[edge_idx as usize].clear(); - self.empty_prefixes.remove_edge(&edge_idx); - for edges2 in self.empty_couple_edges.iter_mut() { - edges2.remove(edge_idx); + pub fn add_condition(&mut self, condition: Interned) { + self.conditions.insert(condition); + self.condition_couples.get_mut(condition).clear(); + self.prefixes.remove_edge(condition.into_inner()); // TODO: typed PathSet + for (_, edges2) in self.condition_couples.iter_mut() { + edges2.remove(condition); } } /// Store in the cache that every path containing the given prefix resolves to no documents. - pub fn forbid_prefix(&mut self, prefix: &[u16]) { - self.empty_prefixes.insert(prefix.iter().copied()); + pub fn add_prefix(&mut self, prefix: &[u16]) { + // TODO: typed PathSet + self.prefixes.insert(prefix.iter().copied()); } /// Store in the cache that every path containing the two given edges resolves to no documents. - pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) { - self.empty_couple_edges[edge1 as usize].insert(edge2); + pub fn add_condition_couple( + &mut self, + edge1: Interned, + edge2: Interned, + ) { + self.condition_couples.get_mut(edge1).insert(edge2); } /// Returns true if the cache can determine that the given path resolves to no documents. - pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool { - if path_bitmap.intersects(&self.empty_edges) { + pub fn path_is_dead_end( + &self, + path: &[u16], + path_bitmap: &SmallBitmap, + ) -> bool { + if path_bitmap.intersects(&self.conditions) { return true; } for edge in path.iter() { - let forbidden_other_edges = &self.empty_couple_edges[*edge as usize]; + // TODO: typed path + let forbidden_other_edges = self.condition_couples.get(Interned::new(*edge)); if path_bitmap.intersects(forbidden_other_edges) { return true; } } - if self.empty_prefixes.contains_prefix_of_path(path) { + if self.prefixes.contains_prefix_of_path(path) { return true; } false diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index ee93bee13..7b82dc0a1 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -18,13 +18,13 @@ mod typo; use std::hash::Hash; -pub use edge_docids_cache::EdgeConditionsCache; -pub use empty_paths_cache::EmptyPathsCache; -pub use proximity::ProximityGraph; +pub use edge_docids_cache::EdgeConditionDocIdsCache; +pub use empty_paths_cache::DeadEndPathCache; +pub use proximity::{ProximityEdge, ProximityGraph}; use roaring::RoaringBitmap; -pub use typo::TypoGraph; +pub use typo::{TypoEdge, TypoGraph}; -use super::interner::{Interned, Interner}; +use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; use super::logger::SearchLogger; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, QueryNode, SearchContext}; @@ -63,8 +63,8 @@ impl Clone for EdgeCondition { /// 3. The condition associated with it #[derive(Clone)] pub struct Edge { - pub source_node: u16, - pub dest_node: u16, + pub source_node: Interned, + pub dest_node: Interned, pub cost: u8, pub condition: EdgeCondition, } @@ -96,7 +96,7 @@ pub trait RankingRuleGraphTrait: Sized { /// (with [`build_step_visit_source_node`](RankingRuleGraphTrait::build_step_visit_source_node)) to `dest_node`. fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, ) -> Result)>>; @@ -104,9 +104,9 @@ pub trait RankingRuleGraphTrait: Sized { fn log_state( graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &[Vec<(u16, SmallBitmap)>], + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ); @@ -118,9 +118,9 @@ pub trait RankingRuleGraphTrait: Sized { /// but replacing the edges. pub struct RankingRuleGraph { pub query_graph: QueryGraph, - pub edges_store: Vec>>, - pub edges_of_node: Vec, - pub conditions_interner: Interner, + pub edges_store: FixedSizeInterner>>, + pub edges_of_node: MappedInterner>>, QueryNode>, + pub conditions_interner: FixedSizeInterner, } impl Clone for RankingRuleGraph { fn clone(&self) -> Self { @@ -133,13 +133,20 @@ impl Clone for RankingRuleGraph { } } impl RankingRuleGraph { - /// Remove the given edge from the ranking rule graph - pub fn remove_ranking_rule_edge(&mut self, edge_index: u16) { - let edge_opt = &mut self.edges_store[edge_index as usize]; - let Some(edge) = &edge_opt else { return }; - let (source_node, _dest_node) = (edge.source_node, edge.dest_node); - *edge_opt = None; - - self.edges_of_node[source_node as usize].remove(edge_index); + /// Remove all edges with the given condition + pub fn remove_edges_with_condition(&mut self, condition_to_remove: Interned) { + for (edge_id, edge_opt) in self.edges_store.iter_mut() { + let Some(edge) = edge_opt.as_mut() else { continue }; + match edge.condition { + EdgeCondition::Unconditional => continue, + EdgeCondition::Conditional(condition) => { + if condition == condition_to_remove { + let (source_node, _dest_node) = (edge.source_node, edge.dest_node); + *edge_opt = None; + self.edges_of_node.get_mut(source_node).remove(edge_id); + } + } + } + } } } diff --git a/milli/src/search/new/ranking_rule_graph/path_set.rs b/milli/src/search/new/ranking_rule_graph/path_set.rs index b601f28d9..d5bab6c14 100644 --- a/milli/src/search/new/ranking_rule_graph/path_set.rs +++ b/milli/src/search/new/ranking_rule_graph/path_set.rs @@ -27,10 +27,10 @@ impl PathSet { } } - pub fn remove_edge(&mut self, forbidden_edge: &u16) { + pub fn remove_edge(&mut self, forbidden_edge: u16) { let mut i = 0; while i < self.nodes.len() { - let should_remove = if &self.nodes[i].0 == forbidden_edge { + let should_remove = if self.nodes[i].0 == forbidden_edge { true } else if !self.nodes[i].1.nodes.is_empty() { self.nodes[i].1.remove_edge(forbidden_edge); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index b8042c408..556b3cb2b 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -3,7 +3,8 @@ use std::collections::BTreeMap; use super::ProximityEdge; use crate::search::new::db_cache::DatabaseCache; -use crate::search::new::interner::{Interned, Interner}; +use crate::search::new::interner::{DedupInterner, Interned}; +use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm}; use crate::search::new::ranking_rule_graph::proximity::WordPair; use crate::search::new::ranking_rule_graph::EdgeCondition; @@ -13,7 +14,7 @@ use heed::RoTxn; fn last_word_of_term_iter<'t>( t: &'t QueryTerm, - phrase_interner: &'t Interner, + phrase_interner: &'t DedupInterner, ) -> impl Iterator>, Interned)> + 't { t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map( move |p| { @@ -24,7 +25,7 @@ fn last_word_of_term_iter<'t>( } fn first_word_of_term_iter<'t>( t: &'t QueryTerm, - phrase_interner: &'t Interner, + phrase_interner: &'t DedupInterner, ) -> impl Iterator, Option>)> + 't { t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map( move |p| { @@ -36,7 +37,7 @@ fn first_word_of_term_iter<'t>( pub fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, from_node: &QueryNode, to_node: &QueryNode, ) -> Result)>> { @@ -50,19 +51,19 @@ pub fn build_edges<'ctx>( term_docids: _, } = ctx; - let (left_term, left_end_position) = match from_node { - QueryNode::Term(LocatedQueryTerm { value, positions }) => { + let (left_term, left_end_position) = match &from_node.data { + QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { (term_interner.get(*value), *positions.end()) } - QueryNode::Deleted => return Ok(vec![]), - QueryNode::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]), - QueryNode::End => return Ok(vec![]), + QueryNodeData::Deleted => return Ok(vec![]), + QueryNodeData::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::End => return Ok(vec![]), }; - let right_term = match &to_node { - QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), - QueryNode::Deleted | QueryNode::Start => return Ok(vec![]), - QueryNode::Term(term) => term, + let right_term = match &to_node.data { + QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]), + QueryNodeData::Term(term) => term, }; let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term; @@ -145,7 +146,7 @@ fn add_prefix_edges<'ctx>( index: &mut &crate::Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut Interner, + word_interner: &mut DedupInterner, right_ngram_length: usize, left_word: Interned, right_prefix: Interned, @@ -207,7 +208,7 @@ fn add_non_prefix_edges<'ctx>( index: &mut &crate::Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &mut Interner, + word_interner: &mut DedupInterner, right_ngram_length: usize, word1: Interned, word2: Interned, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 2cfee0b65..2d226cfc7 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -3,9 +3,9 @@ pub mod compute_docids; use roaring::RoaringBitmap; -use super::empty_paths_cache::EmptyPathsCache; +use super::empty_paths_cache::DeadEndPathCache; use super::{EdgeCondition, RankingRuleGraphTrait}; -use crate::search::new::interner::{Interned, Interner}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; use crate::search::new::query_term::Phrase; use crate::search::new::small_bitmap::SmallBitmap; @@ -56,7 +56,7 @@ impl RankingRuleGraphTrait for ProximityGraph { fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, source_node: &QueryNode, dest_node: &QueryNode, ) -> Result)>> { @@ -66,19 +66,12 @@ impl RankingRuleGraphTrait for ProximityGraph { fn log_state( graph: &super::RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &[Vec<(u16, SmallBitmap)>], + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_proximity_state( - graph, - paths, - empty_paths_cache, - universe, - distances.to_vec(), - cost, - ); + logger.log_proximity_state(graph, paths, empty_paths_cache, universe, distances, cost); } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index 6b832f9b2..c0404d391 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -1,9 +1,10 @@ use roaring::RoaringBitmap; -use super::empty_paths_cache::EmptyPathsCache; +use super::empty_paths_cache::DeadEndPathCache; use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait}; -use crate::search::new::interner::{Interned, Interner}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; +use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm}; use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; @@ -55,13 +56,13 @@ impl RankingRuleGraphTrait for TypoGraph { fn build_edges<'ctx>( ctx: &mut SearchContext<'ctx>, - conditions_interner: &mut Interner, + conditions_interner: &mut DedupInterner, _from_node: &QueryNode, to_node: &QueryNode, ) -> Result)>> { let SearchContext { term_interner, .. } = ctx; - match to_node { - QueryNode::Term(LocatedQueryTerm { value, positions }) => { + match &to_node.data { + QueryNodeData::Term(LocatedQueryTerm { value, positions }) => { let mut edges = vec![]; // Ngrams have a base typo cost // 2-gram -> equivalent to 1 typo @@ -130,20 +131,20 @@ impl RankingRuleGraphTrait for TypoGraph { } Ok(edges) } - QueryNode::End => Ok(vec![(0, EdgeCondition::Unconditional)]), - QueryNode::Deleted | QueryNode::Start => panic!(), + QueryNodeData::End => Ok(vec![(0, EdgeCondition::Unconditional)]), + QueryNodeData::Deleted | QueryNodeData::Start => panic!(), } } fn log_state( graph: &RankingRuleGraph, paths: &[Vec], - empty_paths_cache: &EmptyPathsCache, + empty_paths_cache: &DeadEndPathCache, universe: &RoaringBitmap, - distances: &[Vec<(u16, SmallBitmap)>], + distances: &MappedInterner)>, QueryNode>, cost: u16, logger: &mut dyn SearchLogger, ) { - logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost); + logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances, cost); } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 5e5da8716..7549cfff7 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -186,6 +186,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( // anything, just extend the results and go back to the parent ranking rule. if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); + ranking_rule_universes[cur_ranking_rule_index].clear(); back!(); continue; } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 5ce6ecec2..2f941098d 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -7,11 +7,11 @@ use heed::{BytesDecode, RoTxn}; use roaring::RoaringBitmap; use super::db_cache::DatabaseCache; -use super::interner::{Interned, Interner}; -use super::query_graph::QUERY_GRAPH_NODE_LENGTH_LIMIT; +use super::interner::{DedupInterner, Interned}; +use super::query_graph::QueryNodeData; use super::query_term::{Phrase, QueryTerm}; use super::small_bitmap::SmallBitmap; -use super::{QueryGraph, QueryNode, SearchContext}; +use super::{QueryGraph, SearchContext}; use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; #[derive(Default)] @@ -26,8 +26,8 @@ impl QueryTermDocIdsCache { index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &Interner, - phrase_interner: &Interner, + word_interner: &DedupInterner, + phrase_interner: &DedupInterner, phrase: Interned, ) -> Result<&'s RoaringBitmap> { if self.phrases.contains_key(&phrase) { @@ -44,9 +44,9 @@ impl QueryTermDocIdsCache { index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &Interner, - term_interner: &Interner, - phrase_interner: &Interner, + word_interner: &DedupInterner, + term_interner: &DedupInterner, + phrase_interner: &DedupInterner, term_interned: Interned, ) -> Result<&'s RoaringBitmap> { if self.terms.contains_key(&term_interned) { @@ -105,28 +105,27 @@ pub fn resolve_query_graph<'ctx>( // TODO: there is a faster way to compute this big // roaring bitmap expression - let mut nodes_resolved = SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT); - let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; + let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes); + let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new()); let mut next_nodes_to_visit = VecDeque::new(); next_nodes_to_visit.push_back(q.root_node); - while let Some(node) = next_nodes_to_visit.pop_front() { - let predecessors = &q.edges[node as usize].predecessors; + while let Some(node_id) = next_nodes_to_visit.pop_front() { + let node = q.nodes.get(node_id); + let predecessors = &node.predecessors; if !predecessors.is_subset(&nodes_resolved) { - next_nodes_to_visit.push_back(node); + next_nodes_to_visit.push_back(node_id); continue; } // Take union of all predecessors let mut predecessors_docids = RoaringBitmap::new(); for p in predecessors.iter() { - predecessors_docids |= &path_nodes_docids[p as usize]; + predecessors_docids |= path_nodes_docids.get(p); } - let n = &q.nodes[node as usize]; - - let node_docids = match n { - QueryNode::Term(located_term) => { + let node_docids = match &node.data { + QueryNodeData::Term(located_term) => { let term_docids = query_term_docids.get_query_term_docids( index, txn, @@ -138,26 +137,26 @@ pub fn resolve_query_graph<'ctx>( )?; predecessors_docids & term_docids } - QueryNode::Deleted => { + QueryNodeData::Deleted => { panic!() } - QueryNode::Start => universe.clone(), - QueryNode::End => { + QueryNodeData::Start => universe.clone(), + QueryNodeData::End => { return Ok(predecessors_docids); } }; - nodes_resolved.insert(node); - path_nodes_docids[node as usize] = node_docids; + nodes_resolved.insert(node_id); + *path_nodes_docids.get_mut(node_id) = node_docids; - for succ in q.edges[node as usize].successors.iter() { + for succ in node.successors.iter() { if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) { next_nodes_to_visit.push_back(succ); } } - for prec in q.edges[node as usize].predecessors.iter() { - if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { - path_nodes_docids[prec as usize].clear(); + for prec in node.predecessors.iter() { + if q.nodes.get(prec).successors.is_subset(&nodes_resolved) { + path_nodes_docids.get_mut(prec).clear(); } } } @@ -168,8 +167,8 @@ pub fn resolve_phrase<'ctx>( index: &Index, txn: &'ctx RoTxn, db_cache: &mut DatabaseCache<'ctx>, - word_interner: &Interner, - phrase_interner: &Interner, + word_interner: &DedupInterner, + phrase_interner: &DedupInterner, phrase: Interned, ) -> Result { let Phrase { words } = phrase_interner.get(phrase).clone(); diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index 48a2e02fc..7ab2b61ae 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -1,9 +1,85 @@ +use std::marker::PhantomData; + +use super::interner::{FixedSizeInterner, Interned}; + +pub struct SmallBitmap { + internal: SmallBitmapInternal, + _phantom: PhantomData, +} +impl Clone for SmallBitmap { + fn clone(&self) -> Self { + Self { internal: self.internal.clone(), _phantom: PhantomData } + } +} +impl SmallBitmap { + pub fn for_interned_values_in(interner: &FixedSizeInterner) -> Self { + Self::new(interner.len()) + } + pub fn new(universe_length: u16) -> Self { + if universe_length <= 64 { + Self { internal: SmallBitmapInternal::Tiny(0), _phantom: PhantomData } + } else { + Self { + internal: SmallBitmapInternal::Small( + vec![0; 1 + universe_length as usize / 64].into_boxed_slice(), + ), + _phantom: PhantomData, + } + } + } + pub fn from_iter( + xs: impl Iterator>, + for_interner: &FixedSizeInterner, + ) -> Self { + Self { + internal: SmallBitmapInternal::from_iter( + xs.map(|x| x.into_inner()), + for_interner.len(), + ), + _phantom: PhantomData, + } + } + pub fn is_empty(&self) -> bool { + self.internal.is_empty() + } + pub fn clear(&mut self) { + self.internal.clear() + } + pub fn contains(&self, x: Interned) -> bool { + self.internal.contains(x.into_inner()) + } + pub fn insert(&mut self, x: Interned) { + self.internal.insert(x.into_inner()) + } + pub fn remove(&mut self, x: Interned) { + self.internal.remove(x.into_inner()) + } + + pub fn intersection(&mut self, other: &Self) { + self.internal.intersection(&other.internal) + } + pub fn union(&mut self, other: &Self) { + self.internal.union(&other.internal) + } + pub fn subtract(&mut self, other: &Self) { + self.internal.subtract(&other.internal) + } + pub fn is_subset(&self, other: &Self) -> bool { + self.internal.is_subset(&other.internal) + } + pub fn intersects(&self, other: &Self) -> bool { + self.internal.intersects(&other.internal) + } + pub fn iter(&self) -> impl Iterator> + '_ { + self.internal.iter().map(|x| Interned::new(x)) + } +} #[derive(Clone)] -pub enum SmallBitmap { +pub enum SmallBitmapInternal { Tiny(u64), Small(Box<[u64]>), } -impl SmallBitmap { +impl SmallBitmapInternal { pub fn new(universe_length: u16) -> Self { if universe_length <= 64 { Self::Tiny(0) @@ -20,8 +96,8 @@ impl SmallBitmap { } pub fn is_empty(&self) -> bool { match self { - SmallBitmap::Tiny(set) => *set == 0, - SmallBitmap::Small(sets) => { + SmallBitmapInternal::Tiny(set) => *set == 0, + SmallBitmapInternal::Small(sets) => { for set in sets.iter() { if *set != 0 { return false; @@ -33,8 +109,8 @@ impl SmallBitmap { } pub fn clear(&mut self) { match self { - SmallBitmap::Tiny(set) => *set = 0, - SmallBitmap::Small(sets) => { + SmallBitmapInternal::Tiny(set) => *set = 0, + SmallBitmapInternal::Small(sets) => { for set in sets.iter_mut() { *set = 0; } @@ -43,8 +119,8 @@ impl SmallBitmap { } pub fn contains(&self, mut x: u16) -> bool { let set = match self { - SmallBitmap::Tiny(set) => *set, - SmallBitmap::Small(set) => { + SmallBitmapInternal::Tiny(set) => *set, + SmallBitmapInternal::Small(set) => { let idx = x / 64; x %= 64; set[idx as usize] @@ -54,8 +130,8 @@ impl SmallBitmap { } pub fn insert(&mut self, mut x: u16) { let set = match self { - SmallBitmap::Tiny(set) => set, - SmallBitmap::Small(set) => { + SmallBitmapInternal::Tiny(set) => set, + SmallBitmapInternal::Small(set) => { let idx = x / 64; x %= 64; &mut set[idx as usize] @@ -65,8 +141,8 @@ impl SmallBitmap { } pub fn remove(&mut self, mut x: u16) { let set = match self { - SmallBitmap::Tiny(set) => set, - SmallBitmap::Small(set) => { + SmallBitmapInternal::Tiny(set) => set, + SmallBitmapInternal::Small(set) => { let idx = x / 64; x %= 64; &mut set[idx as usize] @@ -75,20 +151,20 @@ impl SmallBitmap { *set &= !(0b1 << x); } - pub fn intersection(&mut self, other: &SmallBitmap) { + pub fn intersection(&mut self, other: &SmallBitmapInternal) { self.apply_op(other, |a, b| *a &= b); } - pub fn union(&mut self, other: &SmallBitmap) { + pub fn union(&mut self, other: &SmallBitmapInternal) { self.apply_op(other, |a, b| *a |= b); } - pub fn subtract(&mut self, other: &SmallBitmap) { + pub fn subtract(&mut self, other: &SmallBitmapInternal) { self.apply_op(other, |a, b| *a &= !b); } - pub fn apply_op(&mut self, other: &SmallBitmap, op: impl Fn(&mut u64, u64)) { + pub fn apply_op(&mut self, other: &SmallBitmapInternal, op: impl Fn(&mut u64, u64)) { match (self, other) { - (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(a, *b), - (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(a, *b), + (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { assert!(a.len() == b.len(),); for (a, b) in a.iter_mut().zip(b.iter()) { op(a, *b); @@ -99,10 +175,14 @@ impl SmallBitmap { } } } - pub fn all_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + pub fn all_satisfy_op( + &self, + other: &SmallBitmapInternal, + op: impl Fn(u64, u64) -> bool, + ) -> bool { match (self, other) { - (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(*a, *b), - (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b), + (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { assert!(a.len() == b.len()); for (a, b) in a.iter().zip(b.iter()) { if !op(*a, *b) { @@ -116,10 +196,14 @@ impl SmallBitmap { } } } - pub fn any_satisfy_op(&self, other: &SmallBitmap, op: impl Fn(u64, u64) -> bool) -> bool { + pub fn any_satisfy_op( + &self, + other: &SmallBitmapInternal, + op: impl Fn(u64, u64) -> bool, + ) -> bool { match (self, other) { - (SmallBitmap::Tiny(a), SmallBitmap::Tiny(b)) => op(*a, *b), - (SmallBitmap::Small(a), SmallBitmap::Small(b)) => { + (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b), + (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => { assert!(a.len() == b.len()); for (a, b) in a.iter().zip(b.iter()) { if op(*a, *b) { @@ -133,32 +217,32 @@ impl SmallBitmap { } } } - pub fn is_subset(&self, other: &SmallBitmap) -> bool { + pub fn is_subset(&self, other: &SmallBitmapInternal) -> bool { self.all_satisfy_op(other, |a, b| a & !b == 0) } - pub fn intersects(&self, other: &SmallBitmap) -> bool { + pub fn intersects(&self, other: &SmallBitmapInternal) -> bool { self.any_satisfy_op(other, |a, b| a & b != 0) } - pub fn iter(&self) -> SmallBitmapIter<'_> { + pub fn iter(&self) -> SmallBitmapInternalIter<'_> { match self { - SmallBitmap::Tiny(x) => SmallBitmapIter::Tiny(*x), - SmallBitmap::Small(xs) => { - SmallBitmapIter::Small { cur: xs[0], next: &xs[1..], base: 0 } + SmallBitmapInternal::Tiny(x) => SmallBitmapInternalIter::Tiny(*x), + SmallBitmapInternal::Small(xs) => { + SmallBitmapInternalIter::Small { cur: xs[0], next: &xs[1..], base: 0 } } } } } -pub enum SmallBitmapIter<'b> { +pub enum SmallBitmapInternalIter<'b> { Tiny(u64), Small { cur: u64, next: &'b [u64], base: u16 }, } -impl<'b> Iterator for SmallBitmapIter<'b> { +impl<'b> Iterator for SmallBitmapInternalIter<'b> { type Item = u16; fn next(&mut self) -> Option { match self { - SmallBitmapIter::Tiny(set) => { + SmallBitmapInternalIter::Tiny(set) => { if *set > 0 { let idx = set.trailing_zeros() as u16; *set &= *set - 1; @@ -167,7 +251,7 @@ impl<'b> Iterator for SmallBitmapIter<'b> { None } } - SmallBitmapIter::Small { cur, next, base } => { + SmallBitmapInternalIter::Small { cur, next, base } => { if *cur > 0 { let idx = cur.trailing_zeros() as u16; *cur &= *cur - 1; @@ -185,23 +269,23 @@ impl<'b> Iterator for SmallBitmapIter<'b> { } } -#[cfg(test)] -mod tests { - use super::SmallBitmap; +// #[cfg(test)] +// mod tests { +// use super::SmallBitmap; - #[test] - fn test_small_bitmap() { - let mut bitmap1 = SmallBitmap::new(32); - for x in 0..16 { - bitmap1.insert(x * 2); - } - let mut bitmap2 = SmallBitmap::new(32); - for x in 0..=10 { - bitmap2.insert(x * 3); - } - bitmap1.intersection(&bitmap2); - for v in bitmap1.iter() { - println!("{v}"); - } - } -} +// #[test] +// fn test_small_bitmap() { +// let mut bitmap1 = SmallBitmap::new(32); +// for x in 0..16 { +// bitmap1.insert(x * 2); +// } +// let mut bitmap2 = SmallBitmap::new(32); +// for x in 0..=10 { +// bitmap2.insert(x * 3); +// } +// bitmap1.intersection(&bitmap2); +// for v in bitmap1.iter() { +// println!("{v}"); +// } +// } +// } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 5bc5ff1fe..f5f8c0895 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -3,8 +3,9 @@ use std::collections::BTreeSet; use roaring::RoaringBitmap; use super::logger::SearchLogger; +use super::query_graph::QueryNodeData; use super::resolve_query_graph::resolve_query_graph; -use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput, SearchContext}; +use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use crate::{Result, TermsMatchingStrategy}; pub struct Words { @@ -43,12 +44,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { let positions_to_remove = match self.terms_matching_strategy { TermsMatchingStrategy::Last => { let mut all_positions = BTreeSet::new(); - for n in parent_query_graph.nodes.iter() { - match n { - QueryNode::Term(term) => { + for (_, n) in parent_query_graph.nodes.iter() { + match &n.data { + QueryNodeData::Term(term) => { all_positions.extend(term.positions.clone().into_iter()); } - QueryNode::Deleted | QueryNode::Start | QueryNode::End => {} + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {} } } let mut r: Vec = all_positions.into_iter().collect();