Add a few more optimisations to new search algorithms

This commit is contained in:
Loïc Lecrenier 2023-03-08 09:53:05 +01:00
parent 9051065c22
commit 10626dddfc
10 changed files with 104 additions and 86 deletions

View File

@ -6,7 +6,7 @@ use std::collections::hash_map::Entry;
#[derive(Default)]
pub struct DatabaseCache<'search> {
// TODO: interner for all database cache keys
// TODO: interner for all database cache keys?
pub word_pair_proximity_docids:
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
pub word_prefix_pair_proximity_docids:

View File

@ -2,6 +2,7 @@ use super::logger::SearchLogger;
use super::ranking_rule_graph::EdgeDocidsCache;
use super::ranking_rule_graph::EmptyPathsCache;
use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
use super::small_bitmap::SmallBitmap;
use super::SearchContext;
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput};
use crate::Result;
@ -21,7 +22,7 @@ pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
graph: RankingRuleGraph<G>,
edge_docids_cache: EdgeDocidsCache<G>,
empty_paths_cache: EmptyPathsCache,
all_distances: Vec<Vec<u16>>,
all_distances: Vec<Vec<(u16, SmallBitmap)>>,
cur_distance_idx: usize,
}
@ -65,7 +66,6 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<()> {
// TODO: update old state instead of starting from scratch
let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;
let mut edge_docids_cache = EdgeDocidsCache::default();
let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16);
@ -77,7 +77,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
universe,
&mut empty_paths_cache,
)?;
let all_distances = graph.initialize_distances_cheapest();
let all_distances = graph.initialize_distances_with_necessary_edges();
let state = GraphBasedRankingRuleState {
graph,
@ -100,6 +100,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
assert!(universe.len() > 1);
let mut state = self.state.take().unwrap();
remove_empty_edges(
ctx,
&mut state.graph,
@ -114,7 +115,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
self.state = None;
return Ok(None);
}
let cost =
let (cost, _) =
state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
state.cur_distance_idx += 1;
@ -132,12 +133,15 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
let original_universe = universe;
let mut universe = universe.clone();
// TODO: remove this unnecessary clone
let original_graph = graph.clone();
graph.visit_paths_of_cost(
graph.query_graph.root_node as usize,
cost,
all_distances,
empty_paths_cache,
|path, graph, empty_paths_cache| {
paths.push(path.to_vec());
let mut path_docids = universe.clone();
let mut visited_edges = vec![];
let mut cached_edge_docids = vec![];
@ -161,7 +165,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
path_docids &= edge_docids;
if path_docids.is_disjoint(&universe) {
empty_paths_cache.forbid_prefix(&visited_edges);
// empty_paths_cache.forbid_prefix(&visited_edges);
// if the intersection between this edge and any
// previous one is disjoint with the universe,
// then we add these two edges to the empty_path_cache
@ -170,14 +174,12 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
{
let intersection = edge_docids & edge_docids2;
if intersection.is_disjoint(&universe) {
// needs_filtering_empty_couple_edges = true;
empty_paths_cache.forbid_couple_edges(*edge_index2, edge_index);
}
}
return Ok(());
}
}
paths.push(path.to_vec());
bucket |= &path_docids;
universe -= path_docids;
Ok(())
@ -185,7 +187,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
)?;
G::log_state(
&state.graph,
&original_graph,
&paths,
&state.empty_paths_cache,
original_universe,

View File

@ -6,6 +6,7 @@ use std::time::Instant;
use std::{io::Write, path::PathBuf};
use crate::new::ranking_rule_graph::TypoGraph;
use crate::new::small_bitmap::SmallBitmap;
use crate::new::{QueryNode, QueryGraph, SearchContext};
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::ranking_rule_graph::EmptyPathsCache;
@ -45,7 +46,7 @@ pub enum SearchEvents {
paths: Vec<Vec<u16>>,
empty_paths_cache: EmptyPathsCache,
universe: RoaringBitmap,
distances: Vec<Vec<u16>>,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
},
TypoState {
@ -53,7 +54,7 @@ pub enum SearchEvents {
paths: Vec<Vec<u16>>,
empty_paths_cache: EmptyPathsCache,
universe: RoaringBitmap,
distances: Vec<Vec<u16>>,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
},
RankingRuleSkipBucket { ranking_rule_idx: usize, candidates: RoaringBitmap, time: Instant },
@ -165,11 +166,11 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
self.events.push(SearchEvents::WordsState { query_graph: query_graph.clone() });
}
fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph<ProximityGraph>, paths_map: &[Vec<u16>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<u16>>, cost: u16,) {
/// Records the current state of the proximity ranking rule as a
/// `SearchEvents::ProximityState` event.
///
/// `query_graph`, `paths_map`, `empty_paths_cache` and `universe` are borrowed and
/// copied into the event (`clone` / `to_vec`), whereas `distances` is taken by value
/// and moved into the event together with `cost`.
fn log_proximity_state(&mut self, query_graph: &RankingRuleGraph<ProximityGraph>, paths_map: &[Vec<u16>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<(u16, SmallBitmap)>>, cost: u16,) {
// The event is only appended to `self.events`; this method writes no output itself.
self.events.push(SearchEvents::ProximityState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost })
}
fn log_typo_state(&mut self, query_graph: &RankingRuleGraph<TypoGraph>, paths_map: &[Vec<u16>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<u16>>, cost: u16,) {
/// Records the current state of the typo ranking rule as a
/// `SearchEvents::TypoState` event.
///
/// `query_graph`, `paths_map`, `empty_paths_cache` and `universe` are borrowed and
/// copied into the event (`clone` / `to_vec`), whereas `distances` is taken by value
/// and moved into the event together with `cost`.
fn log_typo_state(&mut self, query_graph: &RankingRuleGraph<TypoGraph>, paths_map: &[Vec<u16>], empty_paths_cache: &EmptyPathsCache, universe: &RoaringBitmap, distances: Vec<Vec<(u16, SmallBitmap)>>, cost: u16,) {
// The event is only appended to `self.events`; this method writes no output itself.
self.events.push(SearchEvents::TypoState { graph: query_graph.clone(), paths: paths_map.to_vec(), empty_paths_cache: empty_paths_cache.clone(), universe: universe.clone(), distances, cost })
}
@ -352,7 +353,7 @@ results.{random} {{
writeln!(&mut file, "}}").unwrap();
}
fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u16], file: &mut File) {
fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, distances: &[(u16, SmallBitmap)], file: &mut File) {
match &node {
QueryNode::Term(LocatedQueryTerm { value, .. }) => {
match value {
@ -390,9 +391,9 @@ shape: class").unwrap();
if *use_prefix_db {
writeln!(file, "use prefix DB : true").unwrap();
}
// for (i, d) in distances.iter().enumerate() {
// writeln!(file, "\"distances\" : {d}").unwrap();
// }
for (d, edges) in distances.iter() {
writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::<Vec<_>>() ).unwrap();
}
writeln!(file, "}}").unwrap();
},
@ -420,7 +421,7 @@ shape: class").unwrap();
}
}
}
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, paths: &[Vec<u16>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<u16>>, file: &mut File) {
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, paths: &[Vec<u16>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<(u16, SmallBitmap)>>, file: &mut File) {
writeln!(file,"direction: right").unwrap();
writeln!(file, "Proximity Graph {{").unwrap();
@ -477,7 +478,7 @@ shape: class").unwrap();
// }
// writeln!(file, "}}").unwrap();
}
fn edge_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext,graph: &RankingRuleGraph<R>, edge_idx: u16, file: &mut File) {
fn edge_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, edge_idx: u16, file: &mut File) {
let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ;
let from_node = &graph.query_graph.nodes[*from_node as usize];
let from_node_desc = match from_node {

View File

@ -5,6 +5,7 @@ use roaring::RoaringBitmap;
use super::{
ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph},
small_bitmap::SmallBitmap,
RankingRule, RankingRuleQueryTrait,
};
@ -61,7 +62,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_paths_map: &[Vec<u16>],
_empty_paths_cache: &EmptyPathsCache,
_universe: &RoaringBitmap,
_distances: Vec<Vec<u16>>,
_distances: Vec<Vec<(u16, SmallBitmap)>>,
_cost: u16,
) {
}
@ -72,7 +73,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
_paths: &[Vec<u16>],
_empty_paths_cache: &EmptyPathsCache,
_universe: &RoaringBitmap,
_distances: Vec<Vec<u16>>,
_distances: Vec<Vec<(u16, SmallBitmap)>>,
_cost: u16,
) {
}
@ -123,7 +124,7 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: Vec<Vec<u16>>,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
);
@ -133,7 +134,7 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: Vec<Vec<u16>>,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
);
}

View File

@ -4,7 +4,8 @@ use super::empty_paths_cache::EmptyPathsCache;
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::small_bitmap::SmallBitmap;
use crate::Result;
use std::collections::VecDeque;
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, VecDeque};
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Path {
@ -17,7 +18,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
&mut self,
from: usize,
cost: u16,
all_distances: &[Vec<u16>],
all_distances: &[Vec<(u16, SmallBitmap)>],
empty_paths_cache: &mut EmptyPathsCache,
mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>,
) -> Result<()> {
@ -37,13 +38,9 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
&mut self,
from: usize,
cost: u16,
// TODO: replace all_distances with a Vec<SmallBitmap> where the SmallBitmap contains true if the cost exists and false otherwise
all_distances: &[Vec<u16>],
all_distances: &[Vec<(u16, SmallBitmap)>],
empty_paths_cache: &mut EmptyPathsCache,
visit: &mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>,
// replace prev edges by:
// (1) a small bitmap representing the path
// (2) a pointer within the EmptyPathsCache::forbidden_prefixes structure
prev_edges: &mut Vec<u16>,
cur_path: &mut SmallBitmap,
mut forbidden_edges: SmallBitmap,
@ -55,7 +52,12 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue };
if cost < edge.cost as u16
|| forbidden_edges.contains(edge_idx)
|| !all_distances[edge.to_node as usize].contains(&(cost - edge.cost as u16))
|| !all_distances[edge.to_node as usize].iter().any(
|(next_cost, necessary_edges)| {
(*next_cost == cost - edge.cost as u16)
&& !forbidden_edges.intersects(necessary_edges)
},
)
{
continue;
}
@ -99,21 +101,20 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
forbidden_edges.insert(x);
});
}
if next_any_valid && empty_paths_cache.path_is_empty(prev_edges, cur_path) {
return Ok(any_valid);
}
}
Ok(any_valid)
}
pub fn initialize_distances_cheapest(&self) -> Vec<Vec<u16>> {
let mut distances_to_end: Vec<Vec<u16>> = vec![vec![]; self.query_graph.nodes.len()];
pub fn initialize_distances_with_necessary_edges(&self) -> Vec<Vec<(u16, SmallBitmap)>> {
let mut distances_to_end: Vec<Vec<(u16, SmallBitmap)>> =
vec![vec![]; self.query_graph.nodes.len()];
let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16);
let mut node_stack = VecDeque::new();
distances_to_end[self.query_graph.end_node as usize] = vec![0];
distances_to_end[self.query_graph.end_node as usize] =
vec![(0, SmallBitmap::new(self.all_edges.len() as u16))];
for prev_node in
self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter()
@ -123,21 +124,29 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
}
while let Some(cur_node) = node_stack.pop_front() {
let mut self_distances = vec![];
let mut self_distances = BTreeMap::<u16, SmallBitmap>::new();
let cur_node_edges = &self.node_edges[cur_node];
for edge_idx in cur_node_edges.iter() {
let edge = self.all_edges[edge_idx as usize].as_ref().unwrap();
let succ_node = edge.to_node;
let succ_distances = &distances_to_end[succ_node as usize];
for succ_distance in succ_distances {
self_distances.push(edge.cost as u16 + succ_distance);
for (succ_distance, succ_necessary_edges) in succ_distances {
let potential_necessary_edges = SmallBitmap::from_iter(
std::iter::once(edge_idx).chain(succ_necessary_edges.iter()),
self.all_edges.len() as u16,
);
match self_distances.entry(edge.cost as u16 + succ_distance) {
Entry::Occupied(mut prev_necessary_edges) => {
prev_necessary_edges.get_mut().intersection(&potential_necessary_edges);
}
Entry::Vacant(entry) => {
entry.insert(potential_necessary_edges);
}
}
}
}
self_distances.sort_unstable();
self_distances.dedup();
distances_to_end[cur_node] = self_distances;
distances_to_end[cur_node] = self_distances.into_iter().collect();
for prev_node in self.query_graph.edges[cur_node].predecessors.iter() {
if !enqueued.contains(prev_node) {
node_stack.push_back(prev_node as usize);

View File

@ -49,6 +49,9 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
if self.cache.contains_key(&edge_index) {
// TODO: should we update the bitmap in the cache if the new universe
// reduces it?
// TODO: maybe have a generation: u32 to track every time the universe was
// reduced. Then only attempt to recompute the intersection when there is a chance
// that edge_docids & universe changed
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
}
// TODO: maybe universe doesn't belong here

View File

@ -119,7 +119,7 @@ pub trait RankingRuleGraphTrait: Sized {
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: &[Vec<u16>],
distances: &[Vec<(u16, SmallBitmap)>],
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
);

View File

@ -6,6 +6,7 @@ use super::{EdgeDetails, RankingRuleGraphTrait};
use crate::new::interner::Interned;
use crate::new::logger::SearchLogger;
use crate::new::query_term::WordDerivations;
use crate::new::small_bitmap::SmallBitmap;
use crate::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
use roaring::RoaringBitmap;
@ -64,7 +65,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: &[Vec<u16>],
distances: &[Vec<(u16, SmallBitmap)>],
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
) {

View File

@ -4,6 +4,7 @@ use crate::new::interner::Interned;
use crate::new::logger::SearchLogger;
use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
use crate::new::resolve_query_graph::resolve_phrase;
use crate::new::small_bitmap::SmallBitmap;
use crate::new::{QueryGraph, QueryNode, SearchContext};
use crate::{Result, RoaringBitmapCodec};
use heed::BytesDecode;
@ -123,7 +124,7 @@ impl RankingRuleGraphTrait for TypoGraph {
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: &[Vec<u16>],
distances: &[Vec<(u16, SmallBitmap)>],
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
) {

View File

@ -262,44 +262,44 @@ mod tests {
println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
loop {
let start = Instant::now();
// loop {
let start = Instant::now();
// let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let mut ctx = SearchContext::new(&index, &txn);
let results = execute_search(
&mut ctx,
"which a the releases from poison by the government",
None,
0,
20,
&mut DefaultSearchLogger,
// &mut logger,
)
.unwrap();
// let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let mut ctx = SearchContext::new(&index, &txn);
let results = execute_search(
&mut ctx,
"which a the releases from poison by the government",
None,
0,
20,
&mut DefaultSearchLogger,
// &mut logger,
)
.unwrap();
// logger.write_d2_description(&mut ctx);
// logger.write_d2_description(&mut ctx);
let elapsed = start.elapsed();
println!("{}us", elapsed.as_micros());
let elapsed = start.elapsed();
println!("{}us", elapsed.as_micros());
let _documents = index
.documents(&txn, results.iter().copied())
.unwrap()
.into_iter()
.map(|(id, obkv)| {
let mut object = serde_json::Map::default();
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
let value = obkv.get(fid).unwrap();
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
object.insert(fid_name.to_owned(), value);
}
(id, serde_json::to_string_pretty(&object).unwrap())
})
.collect::<Vec<_>>();
let _documents = index
.documents(&txn, results.iter().copied())
.unwrap()
.into_iter()
.map(|(id, obkv)| {
let mut object = serde_json::Map::default();
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
let value = obkv.get(fid).unwrap();
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
object.insert(fid_name.to_owned(), value);
}
(id, serde_json::to_string_pretty(&object).unwrap())
})
.collect::<Vec<_>>();
println!("{}us: {:?}", elapsed.as_micros(), results);
}
println!("{}us: {:?}", elapsed.as_micros(), results);
// }
// for (id, _document) in documents {
// println!("{id}:");
// // println!("{document}");
@ -321,7 +321,7 @@ mod tests {
let start = Instant::now();
let mut s = Search::new(&txn, &index);
s.query("releases from poison by the government");
s.query("which a the releases from poison by the government");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
let docs = s.execute().unwrap();
@ -362,7 +362,7 @@ mod tests {
// loop {
let start = Instant::now();
// let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let mut ctx = SearchContext::new(&index, &txn);
let results = execute_search(
&mut ctx,
@ -370,12 +370,12 @@ mod tests {
None,
0,
20,
&mut DefaultSearchLogger,
// &mut logger,
// &mut DefaultSearchLogger,
&mut logger,
)
.unwrap();
// logger.write_d2_description(&mut ctx);
logger.write_d2_description(&mut ctx);
let elapsed = start.elapsed();
@ -414,7 +414,7 @@ mod tests {
let start = Instant::now();
let mut s = Search::new(&txn, &index);
s.query("releases from poison by the government");
s.query("which a the releases from poison by the government");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
let docs = s.execute().unwrap();