diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index cfd69b04f..8ebe14047 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -6,7 +6,7 @@ use std::collections::hash_map::Entry; #[derive(Default)] pub struct DatabaseCache<'search> { - // TODO: interner for all database cache keys + // TODO: interner for all database cache keys? pub word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option<&'search [u8]>>, pub word_prefix_pair_proximity_docids: diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index ac56b4f20..6c2e714ad 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -2,6 +2,7 @@ use super::logger::SearchLogger; use super::ranking_rule_graph::EdgeDocidsCache; use super::ranking_rule_graph::EmptyPathsCache; use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::small_bitmap::SmallBitmap; use super::SearchContext; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput}; use crate::Result; @@ -21,7 +22,7 @@ pub struct GraphBasedRankingRuleState { graph: RankingRuleGraph, edge_docids_cache: EdgeDocidsCache, empty_paths_cache: EmptyPathsCache, - all_distances: Vec>, + all_distances: Vec>, cur_distance_idx: usize, } @@ -65,7 +66,6 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { - // TODO: update old state instead of starting from scratch let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?; let mut edge_docids_cache = EdgeDocidsCache::default(); let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16); @@ -77,7 +77,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> universe, &mut empty_paths_cache, )?; - let all_distances = graph.initialize_distances_cheapest(); + let all_distances = graph.initialize_distances_with_necessary_edges(); let state = GraphBasedRankingRuleState { graph, @@ -100,6 +100,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> ) -> Result>> { assert!(universe.len() > 1); let mut state = self.state.take().unwrap(); + remove_empty_edges( ctx, &mut state.graph, @@ -114,7 +115,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> self.state = None; return Ok(None); } - let cost = + let (cost, _) = state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx]; state.cur_distance_idx += 1; @@ -132,12 +133,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> let original_universe = universe; let mut universe = universe.clone(); + // TODO: remove this unnecessary clone + let original_graph = graph.clone(); graph.visit_paths_of_cost( graph.query_graph.root_node as usize, cost, all_distances, empty_paths_cache, |path, graph, empty_paths_cache| { + paths.push(path.to_vec()); let mut path_docids = universe.clone(); let mut visited_edges = vec![]; let mut cached_edge_docids = vec![]; @@ -161,7 +165,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> path_docids &= edge_docids; if path_docids.is_disjoint(&universe) { - empty_paths_cache.forbid_prefix(&visited_edges); + // empty_paths_cache.forbid_prefix(&visited_edges); // if the intersection between this edge and any // previous one is disjoint with the universe, // then we add these two edges to the empty_path_cache @@ -170,14 +174,12 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> { let intersection = edge_docids & edge_docids2; if intersection.is_disjoint(&universe) { - // needs_filtering_empty_couple_edges = true; empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index); } } return Ok(()); } } - paths.push(path.to_vec()); bucket |= &path_docids; universe -= path_docids; Ok(()) @@ -185,7 +187,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph> )?; G::log_state( - &state.graph, + &original_graph, &paths, &state.empty_paths_cache, original_universe, diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 10b5e7097..47b3e2ea2 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -6,6 +6,7 @@ use std::time::Instant; use std::{io::Write, path::PathBuf}; use crate::new::ranking_rule_graph::TypoGraph; +use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryNode, QueryGraph, SearchContext}; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::new::ranking_rule_graph::EmptyPathsCache; @@ -45,7 +46,7 @@ pub enum SearchEvents { paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, }, TypoState { @@ -53,7 +54,7 @@ pub enum SearchEvents { paths: Vec>, empty_paths_cache: EmptyPathsCache, universe: RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, }, RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant }, @@ -165,11 +166,11 @@ impl SearchLogger for DetailedSearchLogger { self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() }); } - fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { + fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } - fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { + fn log_typo_state(&mut self, query_graph: &RankingRuleGraph, paths_map: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec>, cost: u16,) { self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost }) } @@ -352,7 +353,7 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u16], file: &mut File) { + fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, distances: &[(u16, SmallBitmap)], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, .. }) => { match value { @@ -390,9 +391,9 @@ shape: class").unwrap(); if *use_prefix_db { writeln!(file, "use prefix DB : true").unwrap(); } - // for (i, d) in distances.iter().enumerate() { - // writeln!(file, "\"distances\" : {d}").unwrap(); - // } + for (d, edges) in distances.iter() { + writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::>() ).unwrap(); + } writeln!(file, "}}").unwrap(); }, @@ -420,7 +421,7 @@ shape: class").unwrap(); } } } - fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { + fn ranking_rule_graph_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, distances: Vec>, file: &mut File) { writeln!(file,"direction: right").unwrap(); writeln!(file, "Proximity Graph {{").unwrap(); @@ -477,7 +478,7 @@ shape: class").unwrap(); // } // writeln!(file, "}}").unwrap(); } - fn edge_d2_description(ctx: &mut SearchContext,graph: &RankingRuleGraph, edge_idx: u16, file: &mut File) { + fn edge_d2_description(ctx: &mut SearchContext, graph: &RankingRuleGraph, edge_idx: u16, file: &mut File) { let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ; let from_node = &graph.query_graph.nodes[*from_node as usize]; let from_node_desc = match from_node { diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index bf78e4de0..8a10fd064 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -5,6 +5,7 @@ use roaring::RoaringBitmap; use super::{ ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph}, + small_bitmap::SmallBitmap, RankingRule, RankingRuleQueryTrait, }; @@ -61,7 +62,7 @@ impl SearchLogger for DefaultSearchLogger { _paths_map: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: Vec>, _cost: u16, ) { } @@ -72,7 +73,7 @@ impl SearchLogger for DefaultSearchLogger { _paths: &[Vec], _empty_paths_cache: &EmptyPathsCache, _universe: &RoaringBitmap, - _distances: Vec>, + _distances: Vec>, _cost: u16, ) { } @@ -123,7 +124,7 @@ pub trait SearchLogger { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, ); @@ -133,7 +134,7 @@ pub trait SearchLogger { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: Vec>, + distances: Vec>, cost: u16, ); } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 14afd83d0..1adade945 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -4,7 +4,8 @@ use super::empty_paths_cache::EmptyPathsCache; use super::{RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::small_bitmap::SmallBitmap; use crate::Result; -use std::collections::VecDeque; +use std::collections::btree_map::Entry; +use std::collections::{BTreeMap, VecDeque}; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { @@ -17,7 +18,7 @@ impl RankingRuleGraph { &mut self, from: usize, cost: u16, - all_distances: &[Vec], + all_distances: &[Vec<(u16, SmallBitmap)>], empty_paths_cache: &mut EmptyPathsCache, mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, ) -> Result<()> { @@ -37,13 +38,9 @@ impl RankingRuleGraph { &mut self, from: usize, cost: u16, - // TODO: replace all_distances with a Vec where the SmallBitmap contains true if the cost exists and false otherwise - all_distances: &[Vec], + all_distances: &[Vec<(u16, SmallBitmap)>], empty_paths_cache: &mut EmptyPathsCache, visit: &mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>, - // replace prev edges by: - // (1) a small bitmap representing the path - // (2) a pointer within the EmptyPathsCache::forbidden_prefixes structure prev_edges: &mut Vec, cur_path: &mut SmallBitmap, mut forbidden_edges: SmallBitmap, @@ -55,7 +52,12 @@ impl RankingRuleGraph { let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue }; if cost < edge.cost as u16 || forbidden_edges.contains(edge_idx) - || !all_distances[edge.to_node as usize].contains(&(cost - edge.cost as u16)) + || !all_distances[edge.to_node as usize].iter().any( + |(next_cost, necessary_edges)| { + (*next_cost == cost - edge.cost as u16) + && !forbidden_edges.intersects(necessary_edges) + }, + ) { continue; } @@ -99,21 +101,20 @@ impl RankingRuleGraph { forbidden_edges.insert(x); }); } - if next_any_valid && empty_paths_cache.path_is_empty(prev_edges, cur_path) { - return Ok(any_valid); - } } Ok(any_valid) } - pub fn initialize_distances_cheapest(&self) -> Vec> { - let mut distances_to_end: Vec> = vec![vec![]; self.query_graph.nodes.len()]; + pub fn initialize_distances_with_necessary_edges(&self) -> Vec> { + let mut distances_to_end: Vec> = + vec![vec![]; self.query_graph.nodes.len()]; let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16); let mut node_stack = VecDeque::new(); - distances_to_end[self.query_graph.end_node as usize] = vec![0]; + distances_to_end[self.query_graph.end_node as usize] = + vec![(0, SmallBitmap::new(self.all_edges.len() as u16))]; for prev_node in self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter() @@ -123,21 +124,29 @@ impl RankingRuleGraph { } while let Some(cur_node) = node_stack.pop_front() { - let mut self_distances = vec![]; + let mut self_distances = BTreeMap::::new(); let cur_node_edges = &self.node_edges[cur_node]; for edge_idx in cur_node_edges.iter() { let edge = self.all_edges[edge_idx as usize].as_ref().unwrap(); let succ_node = edge.to_node; let succ_distances = &distances_to_end[succ_node as usize]; - for succ_distance in succ_distances { - self_distances.push(edge.cost as u16 + succ_distance); + for (succ_distance, succ_necessary_edges) in succ_distances { + let potential_necessary_edges = SmallBitmap::from_iter( + std::iter::once(edge_idx).chain(succ_necessary_edges.iter()), + self.all_edges.len() as u16, + ); + match self_distances.entry(edge.cost as u16 + succ_distance) { + Entry::Occupied(mut prev_necessary_edges) => { + prev_necessary_edges.get_mut().intersection(&potential_necessary_edges); + } + Entry::Vacant(entry) => { + entry.insert(potential_necessary_edges); + } + } } } - - self_distances.sort_unstable(); - self_distances.dedup(); - distances_to_end[cur_node] = self_distances; + distances_to_end[cur_node] = self_distances.into_iter().collect(); for prev_node in self.query_graph.edges[cur_node].predecessors.iter() { if !enqueued.contains(prev_node) { node_stack.push_back(prev_node as usize); diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 13ee03a22..9823c4fcc 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -49,6 +49,9 @@ impl EdgeDocidsCache { if self.cache.contains_key(&edge_index) { // TODO: should we update the bitmap in the cache if the new universe // reduces it? + // TODO: maybe have a generation: u32 to track every time the universe was + // reduced. Then only attempt to recompute the intersection when there is a chance + // that edge_docids & universe changed return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index])); } // TODO: maybe universe doesn't belong here diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 989986159..6d7445eac 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -119,7 +119,7 @@ pub trait RankingRuleGraphTrait: Sized { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], + distances: &[Vec<(u16, SmallBitmap)>], cost: u16, logger: &mut dyn SearchLogger, ); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 6c95b0805..7cc4f995f 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -6,6 +6,7 @@ use super::{EdgeDetails, RankingRuleGraphTrait}; use crate::new::interner::Interned; use crate::new::logger::SearchLogger; use crate::new::query_term::WordDerivations; +use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; use roaring::RoaringBitmap; @@ -64,7 +65,7 @@ impl RankingRuleGraphTrait for ProximityGraph { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], + distances: &[Vec<(u16, SmallBitmap)>], cost: u16, logger: &mut dyn SearchLogger, ) { diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index c510c4851..d3aec7174 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -4,6 +4,7 @@ use crate::new::interner::Interned; use crate::new::logger::SearchLogger; use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations}; use crate::new::resolve_query_graph::resolve_phrase; +use crate::new::small_bitmap::SmallBitmap; use crate::new::{QueryGraph, QueryNode, SearchContext}; use crate::{Result, RoaringBitmapCodec}; use heed::BytesDecode; @@ -123,7 +124,7 @@ impl RankingRuleGraphTrait for TypoGraph { paths: &[Vec], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, - distances: &[Vec], + distances: &[Vec<(u16, SmallBitmap)>], cost: u16, logger: &mut dyn SearchLogger, ) { diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index b65ff6d1a..82216c9cf 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -262,44 +262,44 @@ mod tests { println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len()); - loop { - let start = Instant::now(); + // loop { + let start = Instant::now(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); - let mut ctx = SearchContext::new(&index, &txn); - let results = execute_search( - &mut ctx, - "which a the releases from poison by the government", - None, - 0, - 20, - &mut DefaultSearchLogger, - // &mut logger, - ) - .unwrap(); + // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut ctx = SearchContext::new(&index, &txn); + let results = execute_search( + &mut ctx, + "which a the releases from poison by the government", + None, + 0, + 20, + &mut DefaultSearchLogger, + // &mut logger, + ) + .unwrap(); - // logger.write_d2_description(&mut ctx); + // logger.write_d2_description(&mut ctx); - let elapsed = start.elapsed(); - println!("{}us", elapsed.as_micros()); + let elapsed = start.elapsed(); + println!("{}us", elapsed.as_micros()); - let _documents = index - .documents(&txn, results.iter().copied()) - .unwrap() - .into_iter() - .map(|(id, obkv)| { - let mut object = serde_json::Map::default(); - for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { - let value = obkv.get(fid).unwrap(); - let value: serde_json::Value = serde_json::from_slice(value).unwrap(); - object.insert(fid_name.to_owned(), value); - } - (id, serde_json::to_string_pretty(&object).unwrap()) - }) - .collect::>(); + let _documents = index + .documents(&txn, results.iter().copied()) + .unwrap() + .into_iter() + .map(|(id, obkv)| { + let mut object = serde_json::Map::default(); + for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() { + let value = obkv.get(fid).unwrap(); + let value: serde_json::Value = serde_json::from_slice(value).unwrap(); + object.insert(fid_name.to_owned(), value); + } + (id, serde_json::to_string_pretty(&object).unwrap()) + }) + .collect::>(); - println!("{}us: {:?}", elapsed.as_micros(), results); - } + println!("{}us: {:?}", elapsed.as_micros(), results); + // } // for (id, _document) in documents { // println!("{id}:"); // // println!("{document}"); @@ -321,7 +321,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("releases from poison by the government"); + s.query("which a the releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); @@ -362,7 +362,7 @@ mod tests { // loop { let start = Instant::now(); - // let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); + let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log"); let mut ctx = SearchContext::new(&index, &txn); let results = execute_search( &mut ctx, @@ -370,12 +370,12 @@ mod tests { None, 0, 20, - &mut DefaultSearchLogger, - // &mut logger, + // &mut DefaultSearchLogger, + &mut logger, ) .unwrap(); - // logger.write_d2_description(&mut ctx); + logger.write_d2_description(&mut ctx); let elapsed = start.elapsed(); @@ -414,7 +414,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("releases from poison by the government"); + s.query("which a the releases from poison by the government"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap();