mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 09:04:59 +08:00
Rewrite proximity ranking rule
This commit is contained in:
parent
ae6bb1ce17
commit
01e24dd630
@ -2,44 +2,26 @@
|
|||||||
|
|
||||||
use super::ProximityCondition;
|
use super::ProximityCondition;
|
||||||
use crate::search::new::interner::{DedupInterner, Interned};
|
use crate::search::new::interner::{DedupInterner, Interned};
|
||||||
use crate::search::new::query_graph::QueryNodeData;
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||||
use crate::search::new::query_term::LocatedQueryTerm;
|
use crate::search::new::SearchContext;
|
||||||
use crate::search::new::{QueryNode, SearchContext};
|
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
pub fn build_edges(
|
pub fn build_edges(
|
||||||
_ctx: &mut SearchContext,
|
_ctx: &mut SearchContext,
|
||||||
conditions_interner: &mut DedupInterner<ProximityCondition>,
|
conditions_interner: &mut DedupInterner<ProximityCondition>,
|
||||||
from_node: &QueryNode,
|
left_term: Option<&LocatedQueryTermSubset>,
|
||||||
to_node: &QueryNode,
|
right_term: &LocatedQueryTermSubset,
|
||||||
) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
|
) -> Result<Vec<(u32, Interned<ProximityCondition>)>> {
|
||||||
let right_term = match &to_node.data {
|
let right_ngram_length = right_term.term_ids.len();
|
||||||
QueryNodeData::End => return Ok(vec![(0, None)]),
|
|
||||||
QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
|
let Some(left_term) = left_term else {
|
||||||
QueryNodeData::Term(term) => term,
|
return Ok(vec![(
|
||||||
|
(right_ngram_length - 1) as u32,
|
||||||
|
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
|
||||||
|
)])
|
||||||
};
|
};
|
||||||
|
|
||||||
let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
|
if left_term.positions.end() + 1 != *right_term.positions.start() {
|
||||||
|
|
||||||
let (right_start_position, right_ngram_length) =
|
|
||||||
(*right_positions.start(), right_positions.len());
|
|
||||||
|
|
||||||
let (left_term_interned, left_end_position) = match &from_node.data {
|
|
||||||
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
|
|
||||||
QueryNodeData::Deleted => return Ok(vec![]),
|
|
||||||
QueryNodeData::Start => {
|
|
||||||
return Ok(vec![(
|
|
||||||
(right_ngram_length - 1) as u8,
|
|
||||||
Some(
|
|
||||||
conditions_interner
|
|
||||||
.insert(ProximityCondition::Term { term: *right_term_interned }),
|
|
||||||
),
|
|
||||||
)])
|
|
||||||
}
|
|
||||||
QueryNodeData::End => return Ok(vec![]),
|
|
||||||
};
|
|
||||||
|
|
||||||
if left_end_position + 1 != right_start_position {
|
|
||||||
// We want to ignore this pair of terms
|
// We want to ignore this pair of terms
|
||||||
// Unconditionally walk through the edge without computing the docids
|
// Unconditionally walk through the edge without computing the docids
|
||||||
// This can happen when, in a query like `the sun flowers are beautiful`, the term
|
// This can happen when, in a query like `the sun flowers are beautiful`, the term
|
||||||
@ -47,30 +29,26 @@ pub fn build_edges(
|
|||||||
// The remaining query graph represents `the sun .. are beautiful`
|
// The remaining query graph represents `the sun .. are beautiful`
|
||||||
// but `sun` and `are` have no proximity condition between them
|
// but `sun` and `are` have no proximity condition between them
|
||||||
return Ok(vec![(
|
return Ok(vec![(
|
||||||
(right_ngram_length - 1) as u8,
|
(right_ngram_length - 1) as u32,
|
||||||
Some(
|
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
|
||||||
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
|
|
||||||
),
|
|
||||||
)]);
|
)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut conditions = vec![];
|
let mut conditions = vec![];
|
||||||
for cost in right_ngram_length..(7 + right_ngram_length) {
|
for cost in right_ngram_length..(7 + right_ngram_length) {
|
||||||
let cost = cost as u8;
|
|
||||||
conditions.push((
|
conditions.push((
|
||||||
cost,
|
cost as u32,
|
||||||
Some(conditions_interner.insert(ProximityCondition::Uninit {
|
conditions_interner.insert(ProximityCondition::Uninit {
|
||||||
left_term: left_term_interned,
|
left_term: left_term.clone(),
|
||||||
right_term: *right_term_interned,
|
right_term: right_term.clone(),
|
||||||
right_term_ngram_len: right_ngram_length as u8,
|
cost: cost as u8,
|
||||||
cost,
|
}),
|
||||||
})),
|
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
conditions.push((
|
conditions.push((
|
||||||
(7 + right_ngram_length) as u8,
|
(7 + right_ngram_length) as u32,
|
||||||
Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
|
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
|
||||||
));
|
));
|
||||||
|
|
||||||
Ok(conditions)
|
Ok(conditions)
|
||||||
|
@ -1,49 +1,37 @@
|
|||||||
#![allow(clippy::too_many_arguments)]
|
#![allow(clippy::too_many_arguments)]
|
||||||
|
|
||||||
use std::iter::FromIterator;
|
|
||||||
|
|
||||||
use super::ProximityCondition;
|
use super::ProximityCondition;
|
||||||
use crate::search::new::db_cache::DatabaseCache;
|
use crate::search::new::interner::Interned;
|
||||||
use crate::search::new::interner::{DedupInterner, Interned};
|
use crate::search::new::query_term::{Phrase, QueryTermSubset};
|
||||||
use crate::search::new::query_term::{Phrase, QueryTerm};
|
use crate::search::new::ranking_rule_graph::ComputedCondition;
|
||||||
use crate::search::new::resolve_query_graph::QueryTermDocIdsCache;
|
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||||
use crate::search::new::SearchContext;
|
use crate::search::new::SearchContext;
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
use crate::{CboRoaringBitmapCodec, Result};
|
||||||
use fxhash::FxHashSet;
|
|
||||||
use heed::RoTxn;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
use std::collections::BTreeSet;
|
||||||
|
|
||||||
pub fn compute_docids(
|
pub fn compute_docids(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
condition: &ProximityCondition,
|
condition: &ProximityCondition,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
|
) -> Result<ComputedCondition> {
|
||||||
let (left_term, right_term, right_term_ngram_len, cost) = match condition {
|
let (left_term, right_term, cost) = match condition {
|
||||||
ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
|
ProximityCondition::Uninit { left_term, right_term, cost } => {
|
||||||
(*left_term, *right_term, *right_term_ngram_len, *cost)
|
(left_term, right_term, *cost)
|
||||||
}
|
}
|
||||||
ProximityCondition::Term { term } => {
|
ProximityCondition::Term { term } => {
|
||||||
let term_v = ctx.term_interner.get(*term);
|
let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
|
||||||
return Ok((
|
docids &= universe;
|
||||||
ctx.term_docids
|
return Ok(ComputedCondition {
|
||||||
.get_query_term_docids(
|
docids,
|
||||||
ctx.index,
|
universe_len: universe.len(),
|
||||||
ctx.txn,
|
start_term_subset: None,
|
||||||
&mut ctx.db_cache,
|
end_term_subset: term.clone(),
|
||||||
&ctx.word_interner,
|
});
|
||||||
&ctx.term_interner,
|
|
||||||
&ctx.phrase_interner,
|
|
||||||
*term,
|
|
||||||
)?
|
|
||||||
.clone(),
|
|
||||||
FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
|
|
||||||
FxHashSet::from_iter(term_v.all_phrases()),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let left_term = ctx.term_interner.get(left_term);
|
let right_term_ngram_len = right_term.term_ids.len() as u8;
|
||||||
let right_term = ctx.term_interner.get(right_term);
|
|
||||||
|
|
||||||
// e.g. for the simple words `sun .. flower`
|
// e.g. for the simple words `sun .. flower`
|
||||||
// the cost is 5
|
// the cost is 5
|
||||||
@ -57,20 +45,13 @@ pub fn compute_docids(
|
|||||||
let forward_proximity = 1 + cost - right_term_ngram_len;
|
let forward_proximity = 1 + cost - right_term_ngram_len;
|
||||||
let backward_proximity = cost - right_term_ngram_len;
|
let backward_proximity = cost - right_term_ngram_len;
|
||||||
|
|
||||||
let mut used_words = FxHashSet::default();
|
|
||||||
let mut used_phrases = FxHashSet::default();
|
|
||||||
|
|
||||||
let mut docids = RoaringBitmap::new();
|
let mut docids = RoaringBitmap::new();
|
||||||
|
|
||||||
if let Some(right_prefix) = right_term.use_prefix_db {
|
if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) {
|
||||||
for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
|
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)?
|
||||||
|
{
|
||||||
compute_prefix_edges(
|
compute_prefix_edges(
|
||||||
ctx.index,
|
ctx,
|
||||||
ctx.txn,
|
|
||||||
&mut ctx.db_cache,
|
|
||||||
&mut ctx.term_docids,
|
|
||||||
&ctx.word_interner,
|
|
||||||
&ctx.phrase_interner,
|
|
||||||
left_word,
|
left_word,
|
||||||
right_prefix,
|
right_prefix,
|
||||||
left_phrase,
|
left_phrase,
|
||||||
@ -78,8 +59,6 @@ pub fn compute_docids(
|
|||||||
backward_proximity,
|
backward_proximity,
|
||||||
&mut docids,
|
&mut docids,
|
||||||
universe,
|
universe,
|
||||||
&mut used_words,
|
|
||||||
&mut used_phrases,
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -91,39 +70,60 @@ pub fn compute_docids(
|
|||||||
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
|
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
|
||||||
// reached
|
// reached
|
||||||
|
|
||||||
for (left_phrase, left_word) in last_word_of_term_iter(left_term, &ctx.phrase_interner) {
|
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
|
||||||
for (right_word, right_phrase) in first_word_of_term_iter(right_term, &ctx.phrase_interner)
|
// Before computing the edges, check that the left word and left phrase
|
||||||
{
|
// aren't disjoint with the universe, but only do it if there is more than
|
||||||
|
// one word derivation to the right.
|
||||||
|
//
|
||||||
|
// This is an optimisation to avoid checking for an excessive number of
|
||||||
|
// pairs.
|
||||||
|
// WAIT, NO.
|
||||||
|
// This should only be done once per node.
|
||||||
|
// Here, we'll potentially do it... 16 times?
|
||||||
|
// Maybe we should do it at edge-build time instead.
|
||||||
|
// Same for the future attribute ranking rule.
|
||||||
|
let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
|
||||||
|
if right_derivs.len() > 1 {
|
||||||
|
let universe = &universe;
|
||||||
|
if let Some(left_phrase) = left_phrase {
|
||||||
|
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? {
|
||||||
|
let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?;
|
||||||
|
if universe.is_disjoint(&left_word_docids) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (right_word, right_phrase) in right_derivs {
|
||||||
compute_non_prefix_edges(
|
compute_non_prefix_edges(
|
||||||
ctx.index,
|
ctx,
|
||||||
ctx.txn,
|
|
||||||
&mut ctx.db_cache,
|
|
||||||
&mut ctx.term_docids,
|
|
||||||
&ctx.word_interner,
|
|
||||||
&ctx.phrase_interner,
|
|
||||||
left_word,
|
left_word,
|
||||||
right_word,
|
right_word,
|
||||||
&[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
|
left_phrase,
|
||||||
|
right_phrase,
|
||||||
forward_proximity,
|
forward_proximity,
|
||||||
backward_proximity,
|
backward_proximity,
|
||||||
&mut docids,
|
&mut docids,
|
||||||
universe,
|
universe,
|
||||||
&mut used_words,
|
|
||||||
&mut used_phrases,
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((docids, used_words, used_phrases))
|
Ok(ComputedCondition {
|
||||||
|
docids,
|
||||||
|
universe_len: universe.len(),
|
||||||
|
// TODO: think about whether we want to reduce the subset,
|
||||||
|
// we probably should!
|
||||||
|
start_term_subset: Some(left_term.clone()),
|
||||||
|
end_term_subset: right_term.clone(),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compute_prefix_edges<'ctx>(
|
fn compute_prefix_edges(
|
||||||
index: &Index,
|
ctx: &mut SearchContext,
|
||||||
txn: &'ctx RoTxn,
|
|
||||||
db_cache: &mut DatabaseCache<'ctx>,
|
|
||||||
term_docids: &mut QueryTermDocIdsCache,
|
|
||||||
word_interner: &DedupInterner<String>,
|
|
||||||
phrase_interner: &DedupInterner<Phrase>,
|
|
||||||
left_word: Interned<String>,
|
left_word: Interned<String>,
|
||||||
right_prefix: Interned<String>,
|
right_prefix: Interned<String>,
|
||||||
left_phrase: Option<Interned<Phrase>>,
|
left_phrase: Option<Interned<Phrase>>,
|
||||||
@ -131,21 +131,16 @@ fn compute_prefix_edges<'ctx>(
|
|||||||
backward_proximity: u8,
|
backward_proximity: u8,
|
||||||
docids: &mut RoaringBitmap,
|
docids: &mut RoaringBitmap,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
used_words: &mut FxHashSet<Interned<String>>,
|
|
||||||
used_phrases: &mut FxHashSet<Interned<Phrase>>,
|
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
let mut used_left_words = BTreeSet::new();
|
||||||
|
let mut used_left_phrases = BTreeSet::new();
|
||||||
|
let mut used_right_prefix = BTreeSet::new();
|
||||||
|
|
||||||
let mut universe = universe.clone();
|
let mut universe = universe.clone();
|
||||||
if let Some(phrase) = left_phrase {
|
if let Some(phrase) = left_phrase {
|
||||||
let phrase_docids = term_docids.get_phrase_docids(
|
let phrase_docids = ctx.get_phrase_docids(phrase)?;
|
||||||
index,
|
|
||||||
txn,
|
|
||||||
db_cache,
|
|
||||||
word_interner,
|
|
||||||
phrase_interner,
|
|
||||||
phrase,
|
|
||||||
)?;
|
|
||||||
if !phrase_docids.is_empty() {
|
if !phrase_docids.is_empty() {
|
||||||
used_phrases.insert(phrase);
|
used_left_phrases.insert(phrase);
|
||||||
}
|
}
|
||||||
universe &= phrase_docids;
|
universe &= phrase_docids;
|
||||||
if universe.is_empty() {
|
if universe.is_empty() {
|
||||||
@ -153,36 +148,28 @@ fn compute_prefix_edges<'ctx>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
|
if let Some(new_docids) =
|
||||||
index,
|
ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)?
|
||||||
txn,
|
{
|
||||||
word_interner,
|
|
||||||
left_word,
|
|
||||||
right_prefix,
|
|
||||||
forward_proximity,
|
|
||||||
)? {
|
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_words.insert(left_word);
|
used_left_words.insert(left_word);
|
||||||
used_words.insert(right_prefix);
|
used_right_prefix.insert(right_prefix);
|
||||||
*docids |= new_docids;
|
*docids |= new_docids;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// No swapping when computing the proximity between a phrase and a word
|
// No swapping when computing the proximity between a phrase and a word
|
||||||
if left_phrase.is_none() {
|
if left_phrase.is_none() {
|
||||||
if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
|
if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
|
||||||
index,
|
|
||||||
txn,
|
|
||||||
word_interner,
|
|
||||||
right_prefix,
|
right_prefix,
|
||||||
left_word,
|
left_word,
|
||||||
backward_proximity,
|
backward_proximity,
|
||||||
)? {
|
)? {
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_words.insert(left_word);
|
used_left_words.insert(left_word);
|
||||||
used_words.insert(right_prefix);
|
used_right_prefix.insert(right_prefix);
|
||||||
*docids |= new_docids;
|
*docids |= new_docids;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -191,72 +178,59 @@ fn compute_prefix_edges<'ctx>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compute_non_prefix_edges<'ctx>(
|
fn compute_non_prefix_edges(
|
||||||
index: &Index,
|
ctx: &mut SearchContext,
|
||||||
txn: &'ctx RoTxn,
|
|
||||||
db_cache: &mut DatabaseCache<'ctx>,
|
|
||||||
term_docids: &mut QueryTermDocIdsCache,
|
|
||||||
word_interner: &DedupInterner<String>,
|
|
||||||
phrase_interner: &DedupInterner<Phrase>,
|
|
||||||
word1: Interned<String>,
|
word1: Interned<String>,
|
||||||
word2: Interned<String>,
|
word2: Interned<String>,
|
||||||
phrases: &[Interned<Phrase>],
|
left_phrase: Option<Interned<Phrase>>,
|
||||||
|
right_phrase: Option<Interned<Phrase>>,
|
||||||
forward_proximity: u8,
|
forward_proximity: u8,
|
||||||
backward_proximity: u8,
|
backward_proximity: u8,
|
||||||
docids: &mut RoaringBitmap,
|
docids: &mut RoaringBitmap,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
used_words: &mut FxHashSet<Interned<String>>,
|
|
||||||
used_phrases: &mut FxHashSet<Interned<Phrase>>,
|
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
let mut used_left_phrases = BTreeSet::new();
|
||||||
|
let mut used_right_phrases = BTreeSet::new();
|
||||||
|
let mut used_left_words = BTreeSet::new();
|
||||||
|
let mut used_right_words = BTreeSet::new();
|
||||||
|
|
||||||
let mut universe = universe.clone();
|
let mut universe = universe.clone();
|
||||||
for phrase in phrases {
|
|
||||||
let phrase_docids = term_docids.get_phrase_docids(
|
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
|
||||||
index,
|
let phrase_docids = ctx.get_phrase_docids(phrase)?;
|
||||||
txn,
|
|
||||||
db_cache,
|
|
||||||
word_interner,
|
|
||||||
phrase_interner,
|
|
||||||
*phrase,
|
|
||||||
)?;
|
|
||||||
if !phrase_docids.is_empty() {
|
|
||||||
used_phrases.insert(*phrase);
|
|
||||||
}
|
|
||||||
universe &= phrase_docids;
|
universe &= phrase_docids;
|
||||||
if universe.is_empty() {
|
if universe.is_empty() {
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
|
if let Some(left_phrase) = left_phrase {
|
||||||
index,
|
used_left_phrases.insert(left_phrase);
|
||||||
txn,
|
}
|
||||||
word_interner,
|
if let Some(right_phrase) = right_phrase {
|
||||||
word1,
|
used_right_phrases.insert(right_phrase);
|
||||||
word2,
|
}
|
||||||
forward_proximity,
|
|
||||||
)? {
|
if let Some(new_docids) =
|
||||||
|
ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
|
||||||
|
{
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_words.insert(word1);
|
used_left_words.insert(word1);
|
||||||
used_words.insert(word2);
|
used_right_words.insert(word2);
|
||||||
*docids |= new_docids;
|
*docids |= new_docids;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if backward_proximity >= 1
|
if backward_proximity >= 1
|
||||||
// no swapping when either term is a phrase
|
// no swapping when either term is a phrase
|
||||||
&& phrases.is_empty()
|
&& left_phrase.is_none() && right_phrase.is_none()
|
||||||
{
|
{
|
||||||
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
|
if let Some(new_docids) =
|
||||||
index,
|
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
|
||||||
txn,
|
{
|
||||||
word_interner,
|
|
||||||
word2,
|
|
||||||
word1,
|
|
||||||
backward_proximity,
|
|
||||||
)? {
|
|
||||||
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
let new_docids = &universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
|
||||||
if !new_docids.is_empty() {
|
if !new_docids.is_empty() {
|
||||||
used_words.insert(word1);
|
used_left_words.insert(word2);
|
||||||
used_words.insert(word2);
|
used_right_words.insert(word1);
|
||||||
*docids |= new_docids;
|
*docids |= new_docids;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -265,25 +239,41 @@ fn compute_non_prefix_edges<'ctx>(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn last_word_of_term_iter<'t>(
|
fn last_words_of_term_derivations(
|
||||||
t: &'t QueryTerm,
|
ctx: &mut SearchContext,
|
||||||
phrase_interner: &'t DedupInterner<Phrase>,
|
t: &QueryTermSubset,
|
||||||
) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
|
) -> Result<BTreeSet<(Option<Interned<Phrase>>, Interned<String>)>> {
|
||||||
t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
|
let mut result = BTreeSet::new();
|
||||||
move |p| {
|
|
||||||
let phrase = phrase_interner.get(p);
|
for w in t.all_single_words_except_prefix_db(ctx)? {
|
||||||
phrase.words.last().unwrap().map(|last| (Some(p), last))
|
result.insert((None, w));
|
||||||
},
|
}
|
||||||
))
|
for p in t.all_phrases(ctx)? {
|
||||||
|
let phrase = ctx.phrase_interner.get(p);
|
||||||
|
let last_term_of_phrase = phrase.words.last().unwrap();
|
||||||
|
if let Some(last_word) = last_term_of_phrase {
|
||||||
|
result.insert((Some(p), *last_word));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
}
|
}
|
||||||
fn first_word_of_term_iter<'t>(
|
fn first_word_of_term_iter(
|
||||||
t: &'t QueryTerm,
|
ctx: &mut SearchContext,
|
||||||
phrase_interner: &'t DedupInterner<Phrase>,
|
t: &QueryTermSubset,
|
||||||
) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
|
) -> Result<BTreeSet<(Interned<String>, Option<Interned<Phrase>>)>> {
|
||||||
t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
|
let mut result = BTreeSet::new();
|
||||||
move |p| {
|
let all_words = t.all_single_words_except_prefix_db(ctx)?;
|
||||||
let phrase = phrase_interner.get(p);
|
for w in all_words {
|
||||||
phrase.words.first().unwrap().map(|first| (first, Some(p)))
|
result.insert((w, None));
|
||||||
},
|
}
|
||||||
))
|
for p in t.all_phrases(ctx)? {
|
||||||
|
let phrase = ctx.phrase_interner.get(p);
|
||||||
|
let first_term_of_phrase = phrase.words.first().unwrap();
|
||||||
|
if let Some(first_word) = first_term_of_phrase {
|
||||||
|
result.insert((*first_word, Some(p)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
@ -1,27 +1,19 @@
|
|||||||
pub mod build;
|
pub mod build;
|
||||||
pub mod compute_docids;
|
pub mod compute_docids;
|
||||||
|
|
||||||
use fxhash::FxHashSet;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
|
||||||
use crate::search::new::logger::SearchLogger;
|
use crate::search::new::logger::SearchLogger;
|
||||||
use crate::search::new::query_term::{Phrase, QueryTerm};
|
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||||
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
|
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
pub enum ProximityCondition {
|
pub enum ProximityCondition {
|
||||||
Uninit {
|
Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 },
|
||||||
left_term: Interned<QueryTerm>,
|
Term { term: LocatedQueryTermSubset },
|
||||||
right_term: Interned<QueryTerm>,
|
|
||||||
right_term_ngram_len: u8,
|
|
||||||
cost: u8,
|
|
||||||
},
|
|
||||||
Term {
|
|
||||||
term: Interned<QueryTerm>,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub enum ProximityGraph {}
|
pub enum ProximityGraph {}
|
||||||
@ -33,18 +25,17 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
|||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
condition: &Self::Condition,
|
condition: &Self::Condition,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
|
) -> Result<ComputedCondition> {
|
||||||
{
|
|
||||||
compute_docids::compute_docids(ctx, condition, universe)
|
compute_docids::compute_docids(ctx, condition, universe)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_edges(
|
fn build_edges(
|
||||||
ctx: &mut SearchContext,
|
ctx: &mut SearchContext,
|
||||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||||
source_node: &QueryNode,
|
source_term: Option<&LocatedQueryTermSubset>,
|
||||||
dest_node: &QueryNode,
|
dest_term: &LocatedQueryTermSubset,
|
||||||
) -> Result<Vec<(u8, Option<Interned<Self::Condition>>)>> {
|
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||||
build::build_edges(ctx, conditions_interner, source_node, dest_node)
|
build::build_edges(ctx, conditions_interner, source_term, dest_term)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn log_state(
|
fn log_state(
|
||||||
@ -52,8 +43,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
|||||||
paths: &[Vec<Interned<ProximityCondition>>],
|
paths: &[Vec<Interned<ProximityCondition>>],
|
||||||
dead_ends_cache: &DeadEndsCache<Self::Condition>,
|
dead_ends_cache: &DeadEndsCache<Self::Condition>,
|
||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
distances: &MappedInterner<QueryNode, Vec<u16>>,
|
distances: &MappedInterner<QueryNode, Vec<u64>>,
|
||||||
cost: u16,
|
cost: u64,
|
||||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||||
) {
|
) {
|
||||||
logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost);
|
logger.log_proximity_state(graph, paths, dead_ends_cache, universe, distances, cost);
|
||||||
@ -66,8 +57,9 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
|||||||
Ok(format!("{cost}: cost"))
|
Ok(format!("{cost}: cost"))
|
||||||
}
|
}
|
||||||
ProximityCondition::Term { term } => {
|
ProximityCondition::Term { term } => {
|
||||||
let term = ctx.term_interner.get(*term);
|
let original_term = ctx.term_interner.get(term.term_subset.original);
|
||||||
Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
|
let original_word = ctx.word_interner.get(original_term.original);
|
||||||
|
Ok(format!("{original_word} : exists"))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user