2021-03-08 16:12:03 +01:00
|
|
|
use std::collections::btree_map::{self, BTreeMap};
|
2021-03-09 17:48:05 +01:00
|
|
|
use std::collections::hash_map::HashMap;
|
2021-02-22 17:17:01 +01:00
|
|
|
use std::mem::take;
|
|
|
|
|
|
|
|
use roaring::RoaringBitmap;
|
2021-02-24 15:37:37 +01:00
|
|
|
use log::debug;
|
2021-02-22 17:17:01 +01:00
|
|
|
|
2021-03-09 17:48:05 +01:00
|
|
|
use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
|
2021-02-22 17:17:01 +01:00
|
|
|
use crate::search::query_tree::{maximum_proximity, Operation, Query};
|
2021-03-09 17:48:05 +01:00
|
|
|
use crate::search::{build_dfa, WordDerivationsCache};
|
2021-03-09 12:04:52 +01:00
|
|
|
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
|
2021-02-22 17:17:01 +01:00
|
|
|
|
|
|
|
pub struct Proximity<'t> {
|
|
|
|
ctx: &'t dyn Context,
|
|
|
|
query_tree: Option<(usize, Operation)>,
|
|
|
|
proximity: u8,
|
|
|
|
candidates: Candidates,
|
2021-02-25 16:54:41 +01:00
|
|
|
bucket_candidates: RoaringBitmap,
|
2021-02-22 17:17:01 +01:00
|
|
|
parent: Option<Box<dyn Criterion + 't>>,
|
|
|
|
candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
2021-03-04 16:07:07 +01:00
|
|
|
plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>,
|
2021-02-22 17:17:01 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
impl<'t> Proximity<'t> {
|
|
|
|
pub fn initial(
|
|
|
|
ctx: &'t dyn Context,
|
|
|
|
query_tree: Option<Operation>,
|
|
|
|
candidates: Option<RoaringBitmap>,
|
2021-03-03 18:16:13 +01:00
|
|
|
) -> Self
|
2021-02-22 17:17:01 +01:00
|
|
|
{
|
2021-03-03 18:16:13 +01:00
|
|
|
Proximity {
|
2021-02-22 17:17:01 +01:00
|
|
|
ctx,
|
|
|
|
query_tree: query_tree.map(|op| (maximum_proximity(&op), op)),
|
|
|
|
proximity: 0,
|
|
|
|
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
|
2021-02-25 16:54:41 +01:00
|
|
|
bucket_candidates: RoaringBitmap::new(),
|
2021-02-22 17:17:01 +01:00
|
|
|
parent: None,
|
|
|
|
candidates_cache: HashMap::new(),
|
2021-03-04 16:07:07 +01:00
|
|
|
plane_sweep_cache: None,
|
2021-03-03 18:16:13 +01:00
|
|
|
}
|
2021-02-22 17:17:01 +01:00
|
|
|
}
|
|
|
|
|
2021-03-03 18:16:13 +01:00
|
|
|
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
|
|
|
|
Proximity {
|
2021-02-22 17:17:01 +01:00
|
|
|
ctx,
|
|
|
|
query_tree: None,
|
|
|
|
proximity: 0,
|
|
|
|
candidates: Candidates::default(),
|
2021-02-25 16:54:41 +01:00
|
|
|
bucket_candidates: RoaringBitmap::new(),
|
2021-02-22 17:17:01 +01:00
|
|
|
parent: Some(parent),
|
|
|
|
candidates_cache: HashMap::new(),
|
2021-03-04 16:07:07 +01:00
|
|
|
plane_sweep_cache: None,
|
2021-03-03 18:16:13 +01:00
|
|
|
}
|
2021-02-22 17:17:01 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'t> Criterion for Proximity<'t> {
|
2021-03-06 11:28:22 +01:00
|
|
|
#[logging_timer::time("Proximity::{}")]
|
2021-03-05 11:02:24 +01:00
|
|
|
fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> {
|
2021-02-22 17:17:01 +01:00
|
|
|
use Candidates::{Allowed, Forbidden};
|
|
|
|
loop {
|
2021-02-24 15:37:37 +01:00
|
|
|
debug!("Proximity at iteration {} (max {:?}) ({:?})",
|
|
|
|
self.proximity,
|
|
|
|
self.query_tree.as_ref().map(|(mp, _)| mp),
|
|
|
|
self.candidates,
|
|
|
|
);
|
|
|
|
|
2021-02-22 17:17:01 +01:00
|
|
|
match (&mut self.query_tree, &mut self.candidates) {
|
|
|
|
(_, Allowed(candidates)) if candidates.is_empty() => {
|
2021-03-01 14:03:12 +01:00
|
|
|
return Ok(Some(CriterionResult {
|
|
|
|
query_tree: self.query_tree.take().map(|(_, qt)| qt),
|
2021-03-09 12:04:52 +01:00
|
|
|
candidates: Some(take(&mut self.candidates).into_inner()),
|
2021-03-01 14:03:12 +01:00
|
|
|
bucket_candidates: take(&mut self.bucket_candidates),
|
|
|
|
}));
|
2021-02-22 17:17:01 +01:00
|
|
|
},
|
|
|
|
(Some((max_prox, query_tree)), Allowed(candidates)) => {
|
|
|
|
if self.proximity as usize > *max_prox {
|
2021-03-04 16:07:07 +01:00
|
|
|
// reset state to (None, Forbidden(_))
|
2021-02-22 17:17:01 +01:00
|
|
|
self.query_tree = None;
|
|
|
|
self.candidates = Candidates::default();
|
|
|
|
} else {
|
2021-03-04 16:07:07 +01:00
|
|
|
let mut new_candidates = if candidates.len() <= 1000 {
|
|
|
|
if let Some(cache) = self.plane_sweep_cache.as_mut() {
|
|
|
|
match cache.next() {
|
|
|
|
Some((p, candidates)) => {
|
|
|
|
self.proximity = p;
|
|
|
|
candidates
|
|
|
|
},
|
|
|
|
None => {
|
|
|
|
// reset state to (None, Forbidden(_))
|
|
|
|
self.query_tree = None;
|
|
|
|
self.candidates = Candidates::default();
|
|
|
|
continue
|
|
|
|
},
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
let cache = resolve_plane_sweep_candidates(
|
|
|
|
self.ctx,
|
|
|
|
query_tree,
|
2021-03-05 11:02:24 +01:00
|
|
|
candidates,
|
|
|
|
wdcache,
|
2021-03-04 16:07:07 +01:00
|
|
|
)?;
|
|
|
|
self.plane_sweep_cache = Some(cache.into_iter());
|
|
|
|
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
} else { // use set theory based algorithm
|
|
|
|
resolve_candidates(
|
|
|
|
self.ctx,
|
|
|
|
&query_tree,
|
|
|
|
self.proximity,
|
|
|
|
&mut self.candidates_cache,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache,
|
2021-03-04 16:07:07 +01:00
|
|
|
)?
|
|
|
|
};
|
2021-02-22 17:17:01 +01:00
|
|
|
|
|
|
|
new_candidates.intersect_with(&candidates);
|
|
|
|
candidates.difference_with(&new_candidates);
|
|
|
|
self.proximity += 1;
|
|
|
|
|
|
|
|
let bucket_candidates = match self.parent {
|
2021-02-25 16:54:41 +01:00
|
|
|
Some(_) => take(&mut self.bucket_candidates),
|
|
|
|
None => new_candidates.clone(),
|
2021-02-22 17:17:01 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
return Ok(Some(CriterionResult {
|
|
|
|
query_tree: Some(query_tree.clone()),
|
2021-03-09 12:04:52 +01:00
|
|
|
candidates: Some(new_candidates),
|
2021-02-22 17:17:01 +01:00
|
|
|
bucket_candidates,
|
|
|
|
}));
|
|
|
|
}
|
|
|
|
},
|
|
|
|
(Some((max_prox, query_tree)), Forbidden(candidates)) => {
|
|
|
|
if self.proximity as usize > *max_prox {
|
|
|
|
self.query_tree = None;
|
|
|
|
self.candidates = Candidates::default();
|
|
|
|
} else {
|
|
|
|
let mut new_candidates = resolve_candidates(
|
|
|
|
self.ctx,
|
|
|
|
&query_tree,
|
|
|
|
self.proximity,
|
|
|
|
&mut self.candidates_cache,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache,
|
2021-02-22 17:17:01 +01:00
|
|
|
)?;
|
|
|
|
|
|
|
|
new_candidates.difference_with(&candidates);
|
|
|
|
candidates.union_with(&new_candidates);
|
|
|
|
self.proximity += 1;
|
|
|
|
|
|
|
|
let bucket_candidates = match self.parent {
|
2021-02-25 16:54:41 +01:00
|
|
|
Some(_) => take(&mut self.bucket_candidates),
|
|
|
|
None => new_candidates.clone(),
|
2021-02-22 17:17:01 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
return Ok(Some(CriterionResult {
|
|
|
|
query_tree: Some(query_tree.clone()),
|
2021-03-09 12:04:52 +01:00
|
|
|
candidates: Some(new_candidates),
|
2021-02-22 17:17:01 +01:00
|
|
|
bucket_candidates,
|
|
|
|
}));
|
|
|
|
}
|
|
|
|
},
|
|
|
|
(None, Allowed(_)) => {
|
|
|
|
let candidates = take(&mut self.candidates).into_inner();
|
|
|
|
return Ok(Some(CriterionResult {
|
|
|
|
query_tree: None,
|
2021-03-09 12:04:52 +01:00
|
|
|
candidates: Some(candidates.clone()),
|
2021-02-25 16:54:41 +01:00
|
|
|
bucket_candidates: candidates,
|
2021-02-22 17:17:01 +01:00
|
|
|
}));
|
|
|
|
},
|
|
|
|
(None, Forbidden(_)) => {
|
|
|
|
match self.parent.as_mut() {
|
|
|
|
Some(parent) => {
|
2021-03-05 11:02:24 +01:00
|
|
|
match parent.next(wdcache)? {
|
2021-02-22 17:17:01 +01:00
|
|
|
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
|
2021-03-09 12:04:52 +01:00
|
|
|
let candidates = match (&query_tree, candidates) {
|
|
|
|
(_, Some(candidates)) => candidates,
|
|
|
|
(Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?,
|
|
|
|
(None, None) => RoaringBitmap::new(),
|
|
|
|
};
|
|
|
|
|
2021-03-09 15:55:59 +01:00
|
|
|
if bucket_candidates.is_empty() {
|
|
|
|
self.bucket_candidates.union_with(&candidates);
|
|
|
|
} else {
|
|
|
|
self.bucket_candidates.union_with(&bucket_candidates);
|
|
|
|
}
|
|
|
|
|
2021-02-22 17:17:01 +01:00
|
|
|
self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
|
|
|
|
self.proximity = 0;
|
|
|
|
self.candidates = Candidates::Allowed(candidates);
|
2021-03-04 16:07:07 +01:00
|
|
|
self.plane_sweep_cache = None;
|
2021-02-22 17:17:01 +01:00
|
|
|
},
|
|
|
|
None => return Ok(None),
|
|
|
|
}
|
|
|
|
},
|
|
|
|
None => return Ok(None),
|
|
|
|
}
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fn resolve_candidates<'t>(
|
|
|
|
ctx: &'t dyn Context,
|
|
|
|
query_tree: &Operation,
|
|
|
|
proximity: u8,
|
|
|
|
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache: &mut WordDerivationsCache,
|
2021-02-22 17:17:01 +01:00
|
|
|
) -> anyhow::Result<RoaringBitmap>
|
|
|
|
{
|
|
|
|
fn resolve_operation<'t>(
|
|
|
|
ctx: &'t dyn Context,
|
|
|
|
query_tree: &Operation,
|
|
|
|
proximity: u8,
|
|
|
|
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache: &mut WordDerivationsCache,
|
2021-02-22 17:17:01 +01:00
|
|
|
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
|
|
|
|
{
|
|
|
|
use Operation::{And, Consecutive, Or, Query};
|
|
|
|
|
|
|
|
let result = match query_tree {
|
2021-03-05 11:02:24 +01:00
|
|
|
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
|
2021-02-22 17:17:01 +01:00
|
|
|
Consecutive(ops) => if proximity == 0 {
|
2021-03-05 11:02:24 +01:00
|
|
|
mdfs(ctx, ops, 0, cache, wdcache)?
|
2021-02-22 17:17:01 +01:00
|
|
|
} else {
|
|
|
|
Default::default()
|
|
|
|
},
|
|
|
|
Or(_, ops) => {
|
|
|
|
let mut output = Vec::new();
|
|
|
|
for op in ops {
|
2021-03-05 11:02:24 +01:00
|
|
|
let result = resolve_operation(ctx, op, proximity, cache, wdcache)?;
|
2021-02-22 17:17:01 +01:00
|
|
|
output.extend(result);
|
|
|
|
}
|
|
|
|
output
|
|
|
|
},
|
|
|
|
Query(q) => if proximity == 0 {
|
2021-03-05 11:02:24 +01:00
|
|
|
let candidates = query_docids(ctx, q, wdcache)?;
|
2021-02-22 17:17:01 +01:00
|
|
|
vec![(q.clone(), q.clone(), candidates)]
|
|
|
|
} else {
|
|
|
|
Default::default()
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
Ok(result)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn mdfs_pair<'t>(
|
|
|
|
ctx: &'t dyn Context,
|
|
|
|
left: &Operation,
|
|
|
|
right: &Operation,
|
|
|
|
proximity: u8,
|
|
|
|
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache: &mut WordDerivationsCache,
|
2021-02-22 17:17:01 +01:00
|
|
|
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
|
|
|
|
{
|
2021-03-02 14:46:50 +01:00
|
|
|
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
|
|
|
|
(0..=mana.min(left_max)).map(move |m| (m, mana - m))
|
2021-02-22 17:17:01 +01:00
|
|
|
}
|
|
|
|
|
2021-03-02 14:46:50 +01:00
|
|
|
let pair_max_proximity = 7;
|
|
|
|
|
2021-02-22 17:17:01 +01:00
|
|
|
let mut output = Vec::new();
|
|
|
|
|
2021-03-02 14:46:50 +01:00
|
|
|
for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) {
|
|
|
|
for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) {
|
2021-02-22 17:17:01 +01:00
|
|
|
let left_key = (left.clone(), left_p);
|
|
|
|
if !cache.contains_key(&left_key) {
|
2021-03-05 11:02:24 +01:00
|
|
|
let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?;
|
2021-02-22 17:17:01 +01:00
|
|
|
cache.insert(left_key.clone(), candidates);
|
|
|
|
}
|
|
|
|
|
|
|
|
let right_key = (right.clone(), right_p);
|
|
|
|
if !cache.contains_key(&right_key) {
|
2021-03-05 11:02:24 +01:00
|
|
|
let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?;
|
2021-02-22 17:17:01 +01:00
|
|
|
cache.insert(right_key.clone(), candidates);
|
|
|
|
}
|
|
|
|
|
|
|
|
let lefts = cache.get(&left_key).unwrap();
|
|
|
|
let rights = cache.get(&right_key).unwrap();
|
|
|
|
|
|
|
|
for (ll, lr, lcandidates) in lefts {
|
|
|
|
for (rl, rr, rcandidates) in rights {
|
2021-03-05 11:02:24 +01:00
|
|
|
let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
|
2021-02-22 17:17:01 +01:00
|
|
|
if lcandidates.len() < rcandidates.len() {
|
|
|
|
candidates.intersect_with(lcandidates);
|
|
|
|
candidates.intersect_with(rcandidates);
|
|
|
|
} else {
|
|
|
|
candidates.intersect_with(rcandidates);
|
|
|
|
candidates.intersect_with(lcandidates);
|
|
|
|
}
|
|
|
|
if !candidates.is_empty() {
|
|
|
|
output.push((ll.clone(), rr.clone(), candidates));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(output)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn mdfs<'t>(
|
|
|
|
ctx: &'t dyn Context,
|
|
|
|
branches: &[Operation],
|
|
|
|
proximity: u8,
|
|
|
|
cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache: &mut WordDerivationsCache,
|
2021-02-22 17:17:01 +01:00
|
|
|
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
|
|
|
|
{
|
|
|
|
// Extract the first two elements but gives the tail
|
|
|
|
// that is just after the first element.
|
|
|
|
let next = branches.split_first().map(|(h1, t)| {
|
|
|
|
(h1, t.split_first().map(|(h2, _)| (h2, t)))
|
|
|
|
});
|
|
|
|
|
|
|
|
match next {
|
2021-03-05 11:02:24 +01:00
|
|
|
Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache),
|
2021-02-22 17:17:01 +01:00
|
|
|
Some((head1, Some((head2, tail)))) => {
|
|
|
|
let mut output = Vec::new();
|
|
|
|
for p in 0..=proximity {
|
2021-03-05 11:02:24 +01:00
|
|
|
for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? {
|
2021-02-22 17:17:01 +01:00
|
|
|
if !head_candidates.is_empty() {
|
2021-03-05 11:02:24 +01:00
|
|
|
for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? {
|
2021-02-22 17:17:01 +01:00
|
|
|
candidates.intersect_with(&head_candidates);
|
|
|
|
if !candidates.is_empty() {
|
|
|
|
output.push((lhead.clone(), rtail, candidates));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ok(output)
|
|
|
|
},
|
2021-03-05 11:02:24 +01:00
|
|
|
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
|
2021-02-22 17:17:01 +01:00
|
|
|
None => return Ok(Default::default()),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut candidates = RoaringBitmap::new();
|
2021-03-05 11:02:24 +01:00
|
|
|
for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? {
|
2021-02-22 17:17:01 +01:00
|
|
|
candidates.union_with(&cds);
|
|
|
|
}
|
|
|
|
Ok(candidates)
|
|
|
|
}
|
2021-03-03 15:41:09 +01:00
|
|
|
|
2021-03-08 16:12:03 +01:00
|
|
|
fn resolve_plane_sweep_candidates(
|
|
|
|
ctx: &dyn Context,
|
2021-03-03 15:41:09 +01:00
|
|
|
query_tree: &Operation,
|
|
|
|
allowed_candidates: &RoaringBitmap,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache: &mut WordDerivationsCache,
|
2021-03-03 15:41:09 +01:00
|
|
|
) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
|
|
|
|
{
|
|
|
|
/// FIXME may be buggy with query like "new new york"
|
2021-03-08 16:12:03 +01:00
|
|
|
fn plane_sweep<'a>(
|
|
|
|
ctx: &dyn Context,
|
|
|
|
operations: &'a [Operation],
|
2021-03-03 15:41:09 +01:00
|
|
|
docid: DocumentId,
|
|
|
|
consecutive: bool,
|
2021-03-08 16:12:03 +01:00
|
|
|
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
2021-03-09 17:48:05 +01:00
|
|
|
words_positions: &HashMap<String, RoaringBitmap>,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache: &mut WordDerivationsCache,
|
|
|
|
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
|
|
|
{
|
|
|
|
fn compute_groups_proximity(
|
|
|
|
groups: &[(usize, (Position, u8, Position))],
|
|
|
|
consecutive: bool,
|
|
|
|
) -> Option<(Position, u8, Position)>
|
|
|
|
{
|
2021-03-03 15:41:09 +01:00
|
|
|
// take the inner proximity of the first group as initial
|
2021-03-08 16:27:52 +01:00
|
|
|
let (_, (_, mut proximity, _)) = groups.first()?;
|
|
|
|
let (_, (left_most_pos, _, _)) = groups.first()?;
|
|
|
|
let (_, (_, _, right_most_pos)) = groups.last()?;
|
2021-03-03 15:41:09 +01:00
|
|
|
|
|
|
|
for pair in groups.windows(2) {
|
|
|
|
if let [(i1, (_, _, rpos1)), (i2, (lpos2, prox2, _))] = pair {
|
|
|
|
// if a pair overlap, meaning that they share at least a word, we return None
|
|
|
|
if rpos1 >= lpos2 { return None }
|
|
|
|
// if groups are in the good order (query order) we remove 1 to the proximity
|
|
|
|
// the proximity is clamped to 7
|
|
|
|
let pair_proximity = if i1 < i2 {
|
|
|
|
(*lpos2 - *rpos1 - 1).min(7)
|
|
|
|
} else {
|
|
|
|
(*lpos2 - *rpos1).min(7)
|
|
|
|
};
|
|
|
|
|
|
|
|
proximity += pair_proximity as u8 + prox2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// if groups should be consecutives, we will only accept groups with a proximity of 0
|
|
|
|
if !consecutive || proximity == 0 {
|
2021-03-08 16:27:52 +01:00
|
|
|
Some((*left_most_pos, proximity, *right_most_pos))
|
2021-03-05 11:02:24 +01:00
|
|
|
} else {
|
|
|
|
None
|
|
|
|
}
|
2021-03-03 15:41:09 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
let groups_len = operations.len();
|
|
|
|
let mut groups_positions = Vec::with_capacity(groups_len);
|
|
|
|
|
|
|
|
for operation in operations {
|
2021-03-09 17:48:05 +01:00
|
|
|
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
|
2021-03-03 15:41:09 +01:00
|
|
|
groups_positions.push(positions.into_iter());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pop top elements of each list.
|
|
|
|
let mut current = Vec::with_capacity(groups_len);
|
|
|
|
for (i, positions) in groups_positions.iter_mut().enumerate() {
|
|
|
|
match positions.next() {
|
|
|
|
Some(p) => current.push((i, p)),
|
|
|
|
// if a group return None, it means that the document does not contain all the words,
|
|
|
|
// we return an empty result.
|
|
|
|
None => return Ok(Vec::new()),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort k elements by their positions.
|
|
|
|
current.sort_unstable_by_key(|(_, p)| *p);
|
|
|
|
|
|
|
|
// Find leftmost and rightmost group and their positions.
|
|
|
|
let mut leftmost = *current.first().unwrap();
|
|
|
|
let mut rightmost = *current.last().unwrap();
|
|
|
|
|
|
|
|
let mut output = Vec::new();
|
|
|
|
loop {
|
|
|
|
// Find the position p of the next elements of a list of the leftmost group.
|
|
|
|
// If the list is empty, break the loop.
|
|
|
|
let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));
|
|
|
|
|
|
|
|
// let q be the position q of second group of the interval.
|
|
|
|
let q = current[1];
|
|
|
|
|
|
|
|
let mut leftmost_index = 0;
|
|
|
|
|
|
|
|
// If p > r, then the interval [l, r] is minimal and
|
|
|
|
// we insert it into the heap according to its size.
|
|
|
|
if p.map_or(true, |p| p.1 > rightmost.1) {
|
|
|
|
leftmost_index = current[0].0;
|
|
|
|
if let Some(group) = compute_groups_proximity(¤t, consecutive) {
|
|
|
|
output.push(group);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO not sure about breaking here or when the p list is found empty.
|
|
|
|
let p = match p {
|
|
|
|
Some(p) => p,
|
|
|
|
None => break,
|
|
|
|
};
|
|
|
|
|
|
|
|
// Remove the leftmost group P in the interval,
|
|
|
|
// and pop the same group from a list.
|
|
|
|
current[leftmost_index] = p;
|
|
|
|
|
|
|
|
if p.1 > rightmost.1 {
|
|
|
|
// if [l, r] is minimal, let r = p and l = q.
|
|
|
|
rightmost = p;
|
|
|
|
leftmost = q;
|
|
|
|
} else {
|
|
|
|
// Ohterwise, let l = min{p,q}.
|
|
|
|
leftmost = if p.1 < q.1 { p } else { q };
|
|
|
|
}
|
|
|
|
|
|
|
|
// Then update the interval and order of groups_positions in the interval.
|
|
|
|
current.sort_unstable_by_key(|(_, p)| *p);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sort the list according to the size and the positions.
|
|
|
|
output.sort_unstable();
|
|
|
|
|
|
|
|
Ok(output)
|
|
|
|
}
|
|
|
|
|
2021-03-08 16:12:03 +01:00
|
|
|
fn resolve_operation<'a>(
|
|
|
|
ctx: &dyn Context,
|
|
|
|
query_tree: &'a Operation,
|
2021-03-03 15:41:09 +01:00
|
|
|
docid: DocumentId,
|
2021-03-08 16:12:03 +01:00
|
|
|
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
2021-03-09 17:48:05 +01:00
|
|
|
words_positions: &HashMap<String, RoaringBitmap>,
|
2021-03-05 11:02:24 +01:00
|
|
|
wdcache: &mut WordDerivationsCache,
|
2021-03-08 16:12:03 +01:00
|
|
|
) -> anyhow::Result<Vec<(Position, u8, Position)>>
|
|
|
|
{
|
2021-03-03 15:41:09 +01:00
|
|
|
use Operation::{And, Consecutive, Or};
|
|
|
|
|
2021-03-08 16:12:03 +01:00
|
|
|
if let Some(result) = rocache.get(query_tree) {
|
|
|
|
return Ok(result.clone());
|
|
|
|
}
|
|
|
|
|
|
|
|
let result = match query_tree {
|
2021-03-09 17:48:05 +01:00
|
|
|
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
|
|
|
|
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
|
2021-03-03 15:41:09 +01:00
|
|
|
Or(_, ops) => {
|
|
|
|
let mut result = Vec::new();
|
|
|
|
for op in ops {
|
2021-03-09 17:48:05 +01:00
|
|
|
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
|
2021-03-03 15:41:09 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
result.sort_unstable();
|
2021-03-08 16:12:03 +01:00
|
|
|
result
|
2021-03-03 15:41:09 +01:00
|
|
|
},
|
2021-03-09 17:48:05 +01:00
|
|
|
Operation::Query(Query { prefix, kind }) => {
|
|
|
|
let mut result = Vec::new();
|
|
|
|
match kind {
|
2021-03-03 15:41:09 +01:00
|
|
|
QueryKind::Exact { word, .. } => {
|
|
|
|
if *prefix {
|
2021-03-09 17:48:05 +01:00
|
|
|
let iter = word_derivations(word, true, 0, &words_positions)
|
|
|
|
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
|
|
|
result.extend(iter);
|
2021-03-03 15:41:09 +01:00
|
|
|
} else {
|
2021-03-09 17:48:05 +01:00
|
|
|
if let Some(positions) = words_positions.get(word) {
|
|
|
|
result.extend(positions.iter().map(|p| (p, 0, p)));
|
|
|
|
}
|
2021-03-03 15:41:09 +01:00
|
|
|
}
|
|
|
|
},
|
|
|
|
QueryKind::Tolerant { typo, word } => {
|
2021-03-09 17:48:05 +01:00
|
|
|
let iter = word_derivations(word, *prefix, *typo, &words_positions)
|
|
|
|
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
2021-03-03 15:41:09 +01:00
|
|
|
result.extend(iter);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
result.sort_unstable();
|
2021-03-08 16:12:03 +01:00
|
|
|
result
|
2021-03-03 15:41:09 +01:00
|
|
|
}
|
2021-03-08 16:12:03 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
rocache.insert(query_tree, result.clone());
|
|
|
|
Ok(result)
|
2021-03-03 15:41:09 +01:00
|
|
|
}
|
|
|
|
|
2021-03-09 17:48:05 +01:00
|
|
|
fn word_derivations<'a>(
|
|
|
|
word: &str,
|
|
|
|
is_prefix: bool,
|
|
|
|
max_typo: u8,
|
|
|
|
words_positions: &'a HashMap<String, RoaringBitmap>,
|
|
|
|
) -> impl Iterator<Item = &'a RoaringBitmap>
|
|
|
|
{
|
|
|
|
let dfa = build_dfa(word, max_typo, is_prefix);
|
|
|
|
words_positions.iter().filter_map(move |(document_word, positions)| {
|
|
|
|
use levenshtein_automata::Distance;
|
|
|
|
match dfa.eval(document_word) {
|
|
|
|
Distance::Exact(_) => Some(positions),
|
|
|
|
Distance::AtLeast(_) => None,
|
|
|
|
}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2021-03-08 16:12:03 +01:00
|
|
|
let mut resolve_operation_cache = HashMap::new();
|
2021-03-03 15:41:09 +01:00
|
|
|
let mut candidates = BTreeMap::new();
|
|
|
|
for docid in allowed_candidates {
|
2021-03-09 17:48:05 +01:00
|
|
|
let words_positions = ctx.docid_words_positions(docid)?;
|
2021-03-08 16:12:03 +01:00
|
|
|
resolve_operation_cache.clear();
|
|
|
|
let positions = resolve_operation(
|
|
|
|
ctx,
|
|
|
|
query_tree,
|
|
|
|
docid,
|
|
|
|
&mut resolve_operation_cache,
|
2021-03-09 17:48:05 +01:00
|
|
|
&words_positions,
|
2021-03-08 16:12:03 +01:00
|
|
|
wdcache,
|
|
|
|
)?;
|
2021-03-03 15:41:09 +01:00
|
|
|
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
|
|
|
|
let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7);
|
|
|
|
candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid);
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(candidates)
|
|
|
|
}
|