2023-03-06 03:07:37 +08:00
|
|
|
mod db_cache;
|
2023-03-09 22:20:29 +08:00
|
|
|
mod distinct;
|
2023-03-06 03:07:37 +08:00
|
|
|
mod graph_based_ranking_rule;
|
2023-03-07 02:21:55 +08:00
|
|
|
mod interner;
|
2023-03-06 03:07:37 +08:00
|
|
|
mod logger;
|
|
|
|
mod query_graph;
|
|
|
|
mod query_term;
|
|
|
|
mod ranking_rule_graph;
|
|
|
|
mod ranking_rules;
|
|
|
|
mod resolve_query_graph;
|
2023-03-09 18:12:31 +08:00
|
|
|
// TODO: documentation + comments
|
2023-03-07 21:42:58 +08:00
|
|
|
mod small_bitmap;
|
2023-03-09 18:12:31 +08:00
|
|
|
// TODO: documentation + comments
|
2023-03-27 17:04:04 +08:00
|
|
|
// implementation is currently an adaptation of the previous implementation to fit with the new model
|
2023-03-06 03:07:37 +08:00
|
|
|
mod sort;
|
2023-03-09 18:12:31 +08:00
|
|
|
// TODO: documentation + comments
|
2023-03-06 03:07:37 +08:00
|
|
|
mod words;
|
2023-02-21 16:41:58 +08:00
|
|
|
|
2023-03-13 16:52:17 +08:00
|
|
|
use std::collections::{BTreeSet, HashSet};
|
2023-03-08 16:55:53 +08:00
|
|
|
|
2023-03-23 16:39:16 +08:00
|
|
|
use charabia::TokenizerBuilder;
|
2023-03-06 15:35:01 +08:00
|
|
|
use db_cache::DatabaseCache;
|
2023-03-18 22:04:34 +08:00
|
|
|
use graph_based_ranking_rule::{Proximity, Typo};
|
2023-03-06 15:35:01 +08:00
|
|
|
use heed::RoTxn;
|
2023-03-18 22:04:34 +08:00
|
|
|
use interner::DedupInterner;
|
2023-03-19 22:15:58 +08:00
|
|
|
pub use logger::detailed::DetailedSearchLogger;
|
|
|
|
pub use logger::{DefaultSearchLogger, SearchLogger};
|
2023-03-18 22:04:34 +08:00
|
|
|
use query_graph::{QueryGraph, QueryNode, QueryNodeData};
|
|
|
|
use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
|
|
|
|
use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
|
|
|
|
use resolve_query_graph::{resolve_query_graph, QueryTermDocIdsCache};
|
2023-03-07 21:42:58 +08:00
|
|
|
use roaring::RoaringBitmap;
|
2023-03-18 22:04:34 +08:00
|
|
|
use words::Words;
|
2023-03-08 16:55:53 +08:00
|
|
|
|
2023-03-18 22:04:34 +08:00
|
|
|
use self::ranking_rules::RankingRule;
|
2023-03-27 17:04:27 +08:00
|
|
|
use crate::{Filter, Index, MatchingWords, Result, SearchResult, TermsMatchingStrategy};
|
2023-02-21 16:41:58 +08:00
|
|
|
|
2023-03-16 18:52:51 +08:00
|
|
|
/// A structure used throughout the execution of a search query.
|
2023-03-13 21:03:48 +08:00
|
|
|
pub struct SearchContext<'ctx> {
|
|
|
|
pub index: &'ctx Index,
|
|
|
|
pub txn: &'ctx RoTxn<'ctx>,
|
|
|
|
pub db_cache: DatabaseCache<'ctx>,
|
2023-03-14 23:37:47 +08:00
|
|
|
pub word_interner: DedupInterner<String>,
|
|
|
|
pub phrase_interner: DedupInterner<Phrase>,
|
|
|
|
pub term_interner: DedupInterner<QueryTerm>,
|
2023-03-27 17:04:04 +08:00
|
|
|
// think about memory usage of that field (roaring bitmaps in a hashmap)
|
2023-03-14 17:54:55 +08:00
|
|
|
pub term_docids: QueryTermDocIdsCache,
|
2023-03-07 02:21:55 +08:00
|
|
|
}
|
2023-03-13 21:03:48 +08:00
|
|
|
impl<'ctx> SearchContext<'ctx> {
|
|
|
|
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
|
2023-03-07 02:21:55 +08:00
|
|
|
Self {
|
|
|
|
index,
|
|
|
|
txn,
|
|
|
|
db_cache: <_>::default(),
|
|
|
|
word_interner: <_>::default(),
|
|
|
|
phrase_interner: <_>::default(),
|
2023-03-14 17:54:55 +08:00
|
|
|
term_interner: <_>::default(),
|
|
|
|
term_docids: <_>::default(),
|
2023-03-07 02:21:55 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-16 18:52:51 +08:00
|
|
|
/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
|
2023-03-06 15:35:01 +08:00
|
|
|
#[allow(clippy::too_many_arguments)]
|
2023-03-23 16:15:57 +08:00
|
|
|
fn resolve_maximally_reduced_query_graph(
|
|
|
|
ctx: &mut SearchContext,
|
2023-03-06 15:35:01 +08:00
|
|
|
universe: &RoaringBitmap,
|
|
|
|
query_graph: &QueryGraph,
|
|
|
|
matching_strategy: TermsMatchingStrategy,
|
|
|
|
logger: &mut dyn SearchLogger<QueryGraph>,
|
|
|
|
) -> Result<RoaringBitmap> {
|
|
|
|
let mut graph = query_graph.clone();
|
|
|
|
let mut positions_to_remove = match matching_strategy {
|
|
|
|
TermsMatchingStrategy::Last => {
|
|
|
|
let mut all_positions = BTreeSet::new();
|
2023-03-14 23:37:47 +08:00
|
|
|
for (_, n) in query_graph.nodes.iter() {
|
|
|
|
match &n.data {
|
|
|
|
QueryNodeData::Term(term) => {
|
2023-03-23 16:15:57 +08:00
|
|
|
all_positions.extend(term.positions.clone());
|
2023-03-06 15:35:01 +08:00
|
|
|
}
|
2023-03-14 23:37:47 +08:00
|
|
|
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => {}
|
2023-03-06 15:35:01 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
all_positions.into_iter().collect()
|
|
|
|
}
|
|
|
|
TermsMatchingStrategy::All => vec![],
|
|
|
|
};
|
|
|
|
// don't remove the first term
|
2023-03-13 16:52:17 +08:00
|
|
|
if !positions_to_remove.is_empty() {
|
|
|
|
positions_to_remove.remove(0);
|
|
|
|
}
|
2023-03-06 15:35:01 +08:00
|
|
|
loop {
|
|
|
|
if positions_to_remove.is_empty() {
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
let position_to_remove = positions_to_remove.pop().unwrap();
|
2023-03-08 20:26:29 +08:00
|
|
|
let _ = graph.remove_words_starting_at_position(position_to_remove);
|
2023-03-06 15:35:01 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
logger.query_for_universe(&graph);
|
2023-03-07 02:21:55 +08:00
|
|
|
let docids = resolve_query_graph(ctx, &graph, universe)?;
|
2023-03-06 15:35:01 +08:00
|
|
|
|
|
|
|
Ok(docids)
|
|
|
|
}
|
2023-03-16 18:52:51 +08:00
|
|
|
|
|
|
|
/// Return the list of initialised ranking rules to be used for a placeholder search.
|
2023-03-13 21:03:48 +08:00
|
|
|
fn get_ranking_rules_for_placeholder_search<'ctx>(
|
|
|
|
ctx: &SearchContext<'ctx>,
|
|
|
|
) -> Result<Vec<Box<dyn RankingRule<'ctx, PlaceholderQuery>>>> {
|
2023-03-13 16:52:17 +08:00
|
|
|
// let sort = false;
|
|
|
|
// let mut asc = HashSet::new();
|
|
|
|
// let mut desc = HashSet::new();
|
|
|
|
let /*mut*/ ranking_rules: Vec<Box<dyn RankingRule<PlaceholderQuery>>> = vec![];
|
|
|
|
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
|
|
|
|
for rr in settings_ranking_rules {
|
|
|
|
// Add Words before any of: typo, proximity, attribute, exactness
|
|
|
|
match rr {
|
|
|
|
crate::Criterion::Words
|
|
|
|
| crate::Criterion::Typo
|
|
|
|
| crate::Criterion::Attribute
|
|
|
|
| crate::Criterion::Proximity
|
|
|
|
| crate::Criterion::Exactness => continue,
|
|
|
|
crate::Criterion::Sort => todo!(),
|
|
|
|
crate::Criterion::Asc(_) => todo!(),
|
|
|
|
crate::Criterion::Desc(_) => todo!(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(ranking_rules)
|
|
|
|
}
|
2023-03-16 18:52:51 +08:00
|
|
|
|
|
|
|
/// Return the list of initialised ranking rules to be used for a query graph search.
|
2023-03-13 21:03:48 +08:00
|
|
|
fn get_ranking_rules_for_query_graph_search<'ctx>(
|
|
|
|
ctx: &SearchContext<'ctx>,
|
2023-03-13 16:52:17 +08:00
|
|
|
terms_matching_strategy: TermsMatchingStrategy,
|
2023-03-13 21:03:48 +08:00
|
|
|
) -> Result<Vec<Box<dyn RankingRule<'ctx, QueryGraph>>>> {
|
2023-03-13 16:52:17 +08:00
|
|
|
// query graph search
|
|
|
|
let mut words = false;
|
|
|
|
let mut typo = false;
|
|
|
|
let mut proximity = false;
|
|
|
|
let sort = false;
|
|
|
|
let attribute = false;
|
|
|
|
let exactness = false;
|
|
|
|
let mut asc = HashSet::new();
|
|
|
|
let mut desc = HashSet::new();
|
|
|
|
|
|
|
|
let mut ranking_rules: Vec<Box<dyn RankingRule<QueryGraph>>> = vec![];
|
|
|
|
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
|
|
|
|
for rr in settings_ranking_rules {
|
|
|
|
// Add Words before any of: typo, proximity, attribute, exactness
|
|
|
|
match rr {
|
|
|
|
crate::Criterion::Typo
|
|
|
|
| crate::Criterion::Attribute
|
|
|
|
| crate::Criterion::Proximity
|
2023-03-22 21:43:08 +08:00
|
|
|
// TODO: no exactness
|
2023-03-13 16:52:17 +08:00
|
|
|
| crate::Criterion::Exactness => {
|
|
|
|
if !words {
|
|
|
|
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
|
|
|
|
words = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
_ => {}
|
|
|
|
}
|
|
|
|
match rr {
|
|
|
|
crate::Criterion::Words => {
|
|
|
|
if words {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
|
|
|
|
words = true;
|
|
|
|
}
|
|
|
|
crate::Criterion::Typo => {
|
|
|
|
if typo {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
typo = true;
|
|
|
|
ranking_rules.push(Box::<Typo>::default());
|
|
|
|
}
|
|
|
|
crate::Criterion::Proximity => {
|
|
|
|
if proximity {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
proximity = true;
|
|
|
|
ranking_rules.push(Box::<Proximity>::default());
|
|
|
|
}
|
|
|
|
crate::Criterion::Attribute => {
|
|
|
|
if attribute {
|
|
|
|
continue;
|
|
|
|
}
|
2023-03-23 16:12:35 +08:00
|
|
|
// todo!();
|
2023-03-13 16:52:17 +08:00
|
|
|
// attribute = false;
|
|
|
|
}
|
|
|
|
crate::Criterion::Sort => {
|
|
|
|
if sort {
|
|
|
|
continue;
|
|
|
|
}
|
2023-03-23 16:12:35 +08:00
|
|
|
// todo!();
|
2023-03-13 16:52:17 +08:00
|
|
|
// sort = false;
|
|
|
|
}
|
|
|
|
crate::Criterion::Exactness => {
|
|
|
|
if exactness {
|
|
|
|
continue;
|
|
|
|
}
|
2023-03-23 16:12:35 +08:00
|
|
|
// todo!();
|
2023-03-13 16:52:17 +08:00
|
|
|
// exactness = false;
|
|
|
|
}
|
|
|
|
crate::Criterion::Asc(field) => {
|
|
|
|
if asc.contains(&field) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
asc.insert(field);
|
2023-03-14 23:37:47 +08:00
|
|
|
// TODO
|
2023-03-13 16:52:17 +08:00
|
|
|
}
|
|
|
|
crate::Criterion::Desc(field) => {
|
|
|
|
if desc.contains(&field) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
desc.insert(field);
|
2023-03-23 16:12:35 +08:00
|
|
|
// todo!();
|
2023-03-13 16:52:17 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(ranking_rules)
|
|
|
|
}
|
2023-03-06 15:35:01 +08:00
|
|
|
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
2023-03-23 16:15:57 +08:00
|
|
|
pub fn execute_search(
|
|
|
|
ctx: &mut SearchContext,
|
2023-03-23 16:35:53 +08:00
|
|
|
query: &Option<String>,
|
2023-03-13 16:52:17 +08:00
|
|
|
terms_matching_strategy: TermsMatchingStrategy,
|
2023-03-23 16:35:53 +08:00
|
|
|
filters: &Option<Filter>,
|
2023-03-06 15:35:01 +08:00
|
|
|
from: usize,
|
|
|
|
length: usize,
|
2023-03-23 16:35:53 +08:00
|
|
|
words_limit: Option<usize>,
|
2023-03-13 16:52:17 +08:00
|
|
|
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
|
|
|
|
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
|
2023-03-18 22:04:34 +08:00
|
|
|
) -> Result<SearchResult> {
|
|
|
|
let mut universe = if let Some(filters) = filters {
|
2023-03-07 02:21:55 +08:00
|
|
|
filters.evaluate(ctx.txn, ctx.index)?
|
2023-03-06 15:35:01 +08:00
|
|
|
} else {
|
2023-03-07 02:21:55 +08:00
|
|
|
ctx.index.documents_ids(ctx.txn)?
|
2023-03-06 15:35:01 +08:00
|
|
|
};
|
|
|
|
|
2023-03-23 16:35:53 +08:00
|
|
|
let documents_ids = if let Some(query) = query {
|
|
|
|
// We make sure that the analyzer is aware of the stop words
|
|
|
|
// this ensures that the query builder is able to properly remove them.
|
|
|
|
let mut tokbuilder = TokenizerBuilder::new();
|
|
|
|
let stop_words = ctx.index.stop_words(ctx.txn)?;
|
|
|
|
if let Some(ref stop_words) = stop_words {
|
|
|
|
tokbuilder.stop_words(stop_words);
|
|
|
|
}
|
|
|
|
|
|
|
|
let script_lang_map = ctx.index.script_language(ctx.txn)?;
|
|
|
|
if !script_lang_map.is_empty() {
|
|
|
|
tokbuilder.allow_list(&script_lang_map);
|
|
|
|
}
|
|
|
|
|
|
|
|
let tokenizer = tokbuilder.build();
|
2023-03-23 16:39:16 +08:00
|
|
|
let tokens = tokenizer.tokenize(query);
|
2023-03-23 16:35:53 +08:00
|
|
|
|
|
|
|
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
|
|
|
|
let graph = QueryGraph::from_query(ctx, query_terms)?;
|
|
|
|
|
2023-03-18 22:04:34 +08:00
|
|
|
universe = resolve_maximally_reduced_query_graph(
|
2023-03-13 16:52:17 +08:00
|
|
|
ctx,
|
|
|
|
&universe,
|
|
|
|
&graph,
|
|
|
|
terms_matching_strategy,
|
|
|
|
query_graph_logger,
|
|
|
|
)?;
|
|
|
|
|
|
|
|
let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?;
|
2023-03-23 16:35:53 +08:00
|
|
|
|
2023-03-18 22:04:34 +08:00
|
|
|
bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
|
2023-03-13 16:52:17 +08:00
|
|
|
} else {
|
|
|
|
let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?;
|
|
|
|
bucket_sort(
|
|
|
|
ctx,
|
|
|
|
ranking_rules,
|
|
|
|
&PlaceholderQuery,
|
|
|
|
&universe,
|
|
|
|
from,
|
|
|
|
length,
|
|
|
|
placeholder_search_logger,
|
2023-03-18 22:04:34 +08:00
|
|
|
)?
|
|
|
|
};
|
|
|
|
|
|
|
|
Ok(SearchResult {
|
|
|
|
// TODO: correct matching words
|
|
|
|
matching_words: MatchingWords::default(),
|
|
|
|
// TODO: candidates with distinct
|
|
|
|
candidates: universe,
|
|
|
|
documents_ids,
|
|
|
|
})
|
|
|
|
}
|