Reorganise initialisation of ranking rules + rename PathsMap -> PathSet

This commit is contained in:
Loïc Lecrenier 2023-03-13 09:52:17 +01:00
parent 9ec9c204d3
commit 5155fd2bf1
8 changed files with 186 additions and 47 deletions

View File

@ -40,12 +40,25 @@ use roaring::RoaringBitmap;
use super::logger::SearchLogger; use super::logger::SearchLogger;
use super::ranking_rule_graph::{ use super::ranking_rule_graph::{
EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait, EdgeDocidsCache, EmptyPathsCache, RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, ProximityGraph,
}; };
use super::small_bitmap::SmallBitmap; use super::small_bitmap::SmallBitmap;
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::Result; use crate::Result;
/// The `proximity` ranking rule: a [`GraphBasedRankingRule`] specialised
/// to the [`ProximityGraph`].
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;

impl Default for Proximity {
    /// Builds the rule by passing the name `"proximity"` to
    /// `GraphBasedRankingRule::new`.
    fn default() -> Self {
        Self::new(String::from("proximity"))
    }
}
/// The `typo` ranking rule: a [`GraphBasedRankingRule`] specialised
/// to the [`TypoGraph`].
pub type Typo = GraphBasedRankingRule<TypoGraph>;

impl Default for Typo {
    /// Builds the rule by passing the name `"typo"` to
    /// `GraphBasedRankingRule::new`.
    fn default() -> Self {
        Self::new(String::from("typo"))
    }
}
/// A generic graph-based ranking rule /// A generic graph-based ranking rule
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> { pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
id: String, id: String,

View File

@ -98,7 +98,7 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
fn initial_universe(&mut self, universe: &RoaringBitmap) { fn initial_universe(&mut self, universe: &RoaringBitmap) {
self.initial_universe = Some(universe.clone()); self.initial_universe = Some(universe.clone());
} }
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<QueryGraph>]) { fn ranking_rules(&mut self, rr: &[Box<dyn RankingRule<QueryGraph>>]) {
self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect()); self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect());
} }

View File

@ -19,7 +19,7 @@ pub trait SearchLogger<Q: RankingRuleQueryTrait> {
fn initial_universe(&mut self, universe: &RoaringBitmap); fn initial_universe(&mut self, universe: &RoaringBitmap);
/// Logs the ranking rules used to perform the search query /// Logs the ranking rules used to perform the search query
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]); fn ranking_rules(&mut self, rr: &[Box<dyn RankingRule<Q>>]);
/// Logs the start of a ranking rule's iteration. /// Logs the start of a ranking rule's iteration.
fn start_iteration_ranking_rule<'transaction>( fn start_iteration_ranking_rule<'transaction>(
@ -90,7 +90,7 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
fn initial_universe(&mut self, _universe: &RoaringBitmap) {} fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
fn ranking_rules(&mut self, _rr: &[&mut dyn RankingRule<Q>]) {} fn ranking_rules(&mut self, _rr: &[Box<dyn RankingRule<Q>>]) {}
fn start_iteration_ranking_rule<'transaction>( fn start_iteration_ranking_rule<'transaction>(
&mut self, &mut self,

View File

@ -17,7 +17,7 @@ mod words;
pub use logger::{DefaultSearchLogger, SearchLogger}; pub use logger::{DefaultSearchLogger, SearchLogger};
use std::collections::BTreeSet; use std::collections::{BTreeSet, HashSet};
use charabia::Tokenize; use charabia::Tokenize;
use db_cache::DatabaseCache; use db_cache::DatabaseCache;
@ -28,10 +28,10 @@ use roaring::RoaringBitmap;
use self::interner::Interner; use self::interner::Interner;
use self::query_term::{Phrase, WordDerivations}; use self::query_term::{Phrase, WordDerivations};
use self::ranking_rules::PlaceholderQuery;
use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache}; use self::resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache};
use crate::search::new::graph_based_ranking_rule::GraphBasedRankingRule; use crate::search::new::graph_based_ranking_rule::{Proximity, Typo};
use crate::search::new::query_term::located_query_terms_from_string; use crate::search::new::query_term::located_query_terms_from_string;
use crate::search::new::ranking_rule_graph::{ProximityGraph, TypoGraph};
use crate::search::new::words::Words; use crate::search::new::words::Words;
use crate::{Filter, Index, Result, TermsMatchingStrategy}; use crate::{Filter, Index, Result, TermsMatchingStrategy};
@ -88,7 +88,9 @@ fn resolve_maximally_reduced_query_graph<'search>(
TermsMatchingStrategy::All => vec![], TermsMatchingStrategy::All => vec![],
}; };
// don't remove the first term // don't remove the first term
if !positions_to_remove.is_empty() {
positions_to_remove.remove(0); positions_to_remove.remove(0);
}
loop { loop {
if positions_to_remove.is_empty() { if positions_to_remove.is_empty() {
break; break;
@ -102,48 +104,172 @@ fn resolve_maximally_reduced_query_graph<'search>(
Ok(docids) Ok(docids)
} }
fn get_ranking_rules_for_placeholder_search<'search>(
ctx: &SearchContext<'search>,
) -> Result<Vec<Box<dyn RankingRule<'search, PlaceholderQuery>>>> {
// let sort = false;
// let mut asc = HashSet::new();
// let mut desc = HashSet::new();
let /*mut*/ ranking_rules: Vec<Box<dyn RankingRule<PlaceholderQuery>>> = vec![];
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
for rr in settings_ranking_rules {
// Add Words before any of: typo, proximity, attribute, exactness
match rr {
crate::Criterion::Words
| crate::Criterion::Typo
| crate::Criterion::Attribute
| crate::Criterion::Proximity
| crate::Criterion::Exactness => continue,
crate::Criterion::Sort => todo!(),
crate::Criterion::Asc(_) => todo!(),
crate::Criterion::Desc(_) => todo!(),
}
}
Ok(ranking_rules)
}
fn get_ranking_rules_for_query_graph_search<'search>(
ctx: &SearchContext<'search>,
terms_matching_strategy: TermsMatchingStrategy,
) -> Result<Vec<Box<dyn RankingRule<'search, QueryGraph>>>> {
// query graph search
let mut words = false;
let mut typo = false;
let mut proximity = false;
let sort = false;
let attribute = false;
let exactness = false;
let mut asc = HashSet::new();
let mut desc = HashSet::new();
let mut ranking_rules: Vec<Box<dyn RankingRule<QueryGraph>>> = vec![];
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
for rr in settings_ranking_rules {
// Add Words before any of: typo, proximity, attribute, exactness
match rr {
crate::Criterion::Typo
| crate::Criterion::Attribute
| crate::Criterion::Proximity
| crate::Criterion::Exactness => {
if !words {
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
words = true;
}
}
_ => {}
}
match rr {
crate::Criterion::Words => {
if words {
continue;
}
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
words = true;
}
crate::Criterion::Typo => {
if typo {
continue;
}
typo = true;
ranking_rules.push(Box::<Typo>::default());
}
crate::Criterion::Proximity => {
if proximity {
continue;
}
proximity = true;
ranking_rules.push(Box::<Proximity>::default());
}
crate::Criterion::Attribute => {
if attribute {
continue;
}
todo!();
// attribute = false;
}
crate::Criterion::Sort => {
if sort {
continue;
}
todo!();
// sort = false;
}
crate::Criterion::Exactness => {
if exactness {
continue;
}
todo!();
// exactness = false;
}
crate::Criterion::Asc(field) => {
if asc.contains(&field) {
continue;
}
asc.insert(field);
todo!();
}
crate::Criterion::Desc(field) => {
if desc.contains(&field) {
continue;
}
desc.insert(field);
todo!();
}
}
}
Ok(ranking_rules)
}
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub fn execute_search<'search>( pub fn execute_search<'search>(
ctx: &mut SearchContext<'search>, ctx: &mut SearchContext<'search>,
query: &str, query: &str,
terms_matching_strategy: TermsMatchingStrategy,
filters: Option<Filter>, filters: Option<Filter>,
from: usize, from: usize,
length: usize, length: usize,
logger: &mut dyn SearchLogger<QueryGraph>, placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<Vec<u32>> { ) -> Result<Vec<u32>> {
assert!(!query.is_empty()); assert!(!query.is_empty());
let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?; let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?;
let graph = QueryGraph::from_query(ctx, query_terms)?; let graph = QueryGraph::from_query(ctx, query_terms)?;
logger.initial_query(&graph);
let universe = if let Some(filters) = filters { let universe = if let Some(filters) = filters {
filters.evaluate(ctx.txn, ctx.index)? filters.evaluate(ctx.txn, ctx.index)?
} else { } else {
ctx.index.documents_ids(ctx.txn)? ctx.index.documents_ids(ctx.txn)?
}; };
// TODO: other way to tell whether it is a placeholder search
// This way of doing things is not correct because if someone searches
// for a word that does not appear in any document, the word will be removed
// from the graph and thus its number of nodes will be == 2
// But in that case, we should return no results.
//
// The search is a placeholder search only if there are no tokens?
if graph.nodes.len() > 2 {
let universe = resolve_maximally_reduced_query_graph( let universe = resolve_maximally_reduced_query_graph(
ctx, ctx,
&universe, &universe,
&graph, &graph,
TermsMatchingStrategy::Last, terms_matching_strategy,
logger, query_graph_logger,
)?; )?;
// TODO: create ranking rules here
logger.initial_universe(&universe); let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?;
bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)
let words = &mut Words::new(TermsMatchingStrategy::Last); } else {
// let sort = &mut Sort::new(index, txn, "release_date".to_owned(), true)?; let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?;
let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned()); bucket_sort(
let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned()); ctx,
// TODO: ranking rules given as argument ranking_rules,
let ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> = &PlaceholderQuery,
vec![words, typo, proximity /*sort*/]; &universe,
from,
bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, logger) length,
placeholder_search_logger,
)
}
} }
#[cfg(test)] #[cfg(test)]
@ -182,10 +308,11 @@ mod tests {
let results = execute_search( let results = execute_search(
&mut ctx, &mut ctx,
"zero config", "zero config",
TermsMatchingStrategy::Last,
None, None,
0, 0,
20, 20,
// &mut DefaultSearchLogger, &mut DefaultSearchLogger,
&mut logger, &mut logger,
) )
.unwrap(); .unwrap();
@ -279,10 +406,11 @@ mod tests {
let results = execute_search( let results = execute_search(
&mut ctx, &mut ctx,
"releases from poison by the government", "releases from poison by the government",
TermsMatchingStrategy::Last,
None, None,
0, 0,
20, 20,
// &mut DefaultSearchLogger, &mut DefaultSearchLogger,
&mut logger, &mut logger,
) )
.unwrap(); .unwrap();

View File

@ -1,4 +1,4 @@
use super::paths_map::PathSet; use super::path_set::PathSet;
use crate::search::new::small_bitmap::SmallBitmap; use crate::search::new::small_bitmap::SmallBitmap;
/// A cache which stores sufficient conditions for a path /// A cache which stores sufficient conditions for a path
@ -10,7 +10,7 @@ pub struct EmptyPathsCache {
pub empty_edges: SmallBitmap, pub empty_edges: SmallBitmap,
/// A set of path prefixes that resolve to no documents. /// A set of path prefixes that resolve to no documents.
pub empty_prefixes: PathSet, pub empty_prefixes: PathSet,
/// A set of empty couple of edge indexes that resolve to no documents. /// A set of empty couples of edge indexes that resolve to no documents.
pub empty_couple_edges: Vec<SmallBitmap>, pub empty_couple_edges: Vec<SmallBitmap>,
} }
impl EmptyPathsCache { impl EmptyPathsCache {

View File

@ -9,7 +9,7 @@ mod build;
mod cheapest_paths; mod cheapest_paths;
mod edge_docids_cache; mod edge_docids_cache;
mod empty_paths_cache; mod empty_paths_cache;
mod paths_map; mod path_set;
/// Implementation of the `proximity` ranking rule /// Implementation of the `proximity` ranking rule
mod proximity; mod proximity;

View File

@ -70,16 +70,15 @@ pub struct RankingRuleOutput<Q> {
pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'search>, ctx: &mut SearchContext<'search>,
mut ranking_rules: Vec<&mut dyn RankingRule<'search, Q>>, mut ranking_rules: Vec<Box<dyn RankingRule<'search, Q>>>,
query_graph: &Q, query: &Q,
universe: &RoaringBitmap, universe: &RoaringBitmap,
from: usize, from: usize,
length: usize, length: usize,
logger: &mut dyn SearchLogger<Q>, logger: &mut dyn SearchLogger<Q>,
) -> Result<Vec<u32>> { ) -> Result<Vec<u32>> {
logger.initial_query(query_graph);
logger.ranking_rules(&ranking_rules); logger.ranking_rules(&ranking_rules);
logger.initial_universe(universe);
let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
ctx.index.fields_ids_map(ctx.txn)?.id(field) ctx.index.fields_ids_map(ctx.txn)?.id(field)
@ -92,8 +91,8 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
} }
let ranking_rules_len = ranking_rules.len(); let ranking_rules_len = ranking_rules.len();
logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?; ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
let mut ranking_rule_universes: Vec<RoaringBitmap> = let mut ranking_rule_universes: Vec<RoaringBitmap> =
vec![RoaringBitmap::default(); ranking_rules_len]; vec![RoaringBitmap::default(); ranking_rules_len];
@ -109,7 +108,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty());
logger.end_iteration_ranking_rule( logger.end_iteration_ranking_rule(
cur_ranking_rule_index, cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index], ranking_rules[cur_ranking_rule_index].as_ref(),
&ranking_rule_universes[cur_ranking_rule_index], &ranking_rule_universes[cur_ranking_rule_index],
); );
ranking_rule_universes[cur_ranking_rule_index].clear(); ranking_rule_universes[cur_ranking_rule_index].clear();
@ -149,7 +148,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
// then just skip the bucket // then just skip the bucket
logger.skip_bucket_ranking_rule( logger.skip_bucket_ranking_rule(
cur_ranking_rule_index, cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index], ranking_rules[cur_ranking_rule_index].as_ref(),
&candidates, &candidates,
); );
} else { } else {
@ -159,7 +158,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
all_candidates.split_at(from - cur_offset); all_candidates.split_at(from - cur_offset);
logger.skip_bucket_ranking_rule( logger.skip_bucket_ranking_rule(
cur_ranking_rule_index, cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index], ranking_rules[cur_ranking_rule_index].as_ref(),
&skipped_candidates.into_iter().collect(), &skipped_candidates.into_iter().collect(),
); );
let candidates = candidates let candidates = candidates
@ -186,7 +185,6 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
// anything, just extend the results and go back to the parent ranking rule. // anything, just extend the results and go back to the parent ranking rule.
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]);
ranking_rule_universes[cur_ranking_rule_index].clear();
back!(); back!();
continue; continue;
} }
@ -198,7 +196,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
logger.next_bucket_ranking_rule( logger.next_bucket_ranking_rule(
cur_ranking_rule_index, cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index], ranking_rules[cur_ranking_rule_index].as_ref(),
&ranking_rule_universes[cur_ranking_rule_index], &ranking_rule_universes[cur_ranking_rule_index],
&next_bucket.candidates, &next_bucket.candidates,
); );
@ -218,7 +216,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>(
ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone();
logger.start_iteration_ranking_rule( logger.start_iteration_ranking_rule(
cur_ranking_rule_index, cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index], ranking_rules[cur_ranking_rule_index].as_ref(),
&next_bucket.query, &next_bucket.query,
&ranking_rule_universes[cur_ranking_rule_index], &ranking_rule_universes[cur_ranking_rule_index],
); );