2021-03-08 16:12:03 +01:00
use std ::collections ::btree_map ::{ self , BTreeMap } ;
2021-03-09 17:48:05 +01:00
use std ::collections ::hash_map ::HashMap ;
2021-02-22 17:17:01 +01:00
use std ::mem ::take ;
use roaring ::RoaringBitmap ;
2021-02-24 15:37:37 +01:00
use log ::debug ;
2021-02-22 17:17:01 +01:00
2021-03-09 17:48:05 +01:00
use crate ::{ DocumentId , Position , search ::{ query_tree ::QueryKind } } ;
2021-02-22 17:17:01 +01:00
use crate ::search ::query_tree ::{ maximum_proximity , Operation , Query } ;
2021-03-09 17:48:05 +01:00
use crate ::search ::{ build_dfa , WordDerivationsCache } ;
2021-03-09 12:04:52 +01:00
use super ::{ Candidates , Criterion , CriterionResult , Context , query_docids , query_pair_proximity_docids , resolve_query_tree } ;
2021-02-22 17:17:01 +01:00
pub struct Proximity < ' t > {
ctx : & ' t dyn Context ,
query_tree : Option < ( usize , Operation ) > ,
proximity : u8 ,
candidates : Candidates ,
2021-02-25 16:54:41 +01:00
bucket_candidates : RoaringBitmap ,
2021-02-22 17:17:01 +01:00
parent : Option < Box < dyn Criterion + ' t > > ,
candidates_cache : HashMap < ( Operation , u8 ) , Vec < ( Query , Query , RoaringBitmap ) > > ,
2021-03-04 16:07:07 +01:00
plane_sweep_cache : Option < btree_map ::IntoIter < u8 , RoaringBitmap > > ,
2021-02-22 17:17:01 +01:00
}
impl < ' t > Proximity < ' t > {
pub fn initial (
ctx : & ' t dyn Context ,
query_tree : Option < Operation > ,
candidates : Option < RoaringBitmap > ,
2021-03-03 18:16:13 +01:00
) -> Self
2021-02-22 17:17:01 +01:00
{
2021-03-03 18:16:13 +01:00
Proximity {
2021-02-22 17:17:01 +01:00
ctx ,
query_tree : query_tree . map ( | op | ( maximum_proximity ( & op ) , op ) ) ,
proximity : 0 ,
candidates : candidates . map_or_else ( Candidates ::default , Candidates ::Allowed ) ,
2021-02-25 16:54:41 +01:00
bucket_candidates : RoaringBitmap ::new ( ) ,
2021-02-22 17:17:01 +01:00
parent : None ,
candidates_cache : HashMap ::new ( ) ,
2021-03-04 16:07:07 +01:00
plane_sweep_cache : None ,
2021-03-03 18:16:13 +01:00
}
2021-02-22 17:17:01 +01:00
}
2021-03-03 18:16:13 +01:00
pub fn new ( ctx : & ' t dyn Context , parent : Box < dyn Criterion + ' t > ) -> Self {
Proximity {
2021-02-22 17:17:01 +01:00
ctx ,
query_tree : None ,
proximity : 0 ,
candidates : Candidates ::default ( ) ,
2021-02-25 16:54:41 +01:00
bucket_candidates : RoaringBitmap ::new ( ) ,
2021-02-22 17:17:01 +01:00
parent : Some ( parent ) ,
candidates_cache : HashMap ::new ( ) ,
2021-03-04 16:07:07 +01:00
plane_sweep_cache : None ,
2021-03-03 18:16:13 +01:00
}
2021-02-22 17:17:01 +01:00
}
}
impl < ' t > Criterion for Proximity < ' t > {
2021-03-06 11:28:22 +01:00
#[ logging_timer::time( " Proximity::{} " ) ]
2021-03-05 11:02:24 +01:00
fn next ( & mut self , wdcache : & mut WordDerivationsCache ) -> anyhow ::Result < Option < CriterionResult > > {
2021-02-22 17:17:01 +01:00
use Candidates ::{ Allowed , Forbidden } ;
loop {
2021-02-24 15:37:37 +01:00
debug! ( " Proximity at iteration {} (max {:?}) ({:?}) " ,
self . proximity ,
self . query_tree . as_ref ( ) . map ( | ( mp , _ ) | mp ) ,
self . candidates ,
) ;
2021-02-22 17:17:01 +01:00
match ( & mut self . query_tree , & mut self . candidates ) {
( _ , Allowed ( candidates ) ) if candidates . is_empty ( ) = > {
2021-03-01 14:03:12 +01:00
return Ok ( Some ( CriterionResult {
query_tree : self . query_tree . take ( ) . map ( | ( _ , qt ) | qt ) ,
2021-03-09 12:04:52 +01:00
candidates : Some ( take ( & mut self . candidates ) . into_inner ( ) ) ,
2021-03-01 14:03:12 +01:00
bucket_candidates : take ( & mut self . bucket_candidates ) ,
} ) ) ;
2021-02-22 17:17:01 +01:00
} ,
( Some ( ( max_prox , query_tree ) ) , Allowed ( candidates ) ) = > {
if self . proximity as usize > * max_prox {
2021-03-04 16:07:07 +01:00
// reset state to (None, Forbidden(_))
2021-02-22 17:17:01 +01:00
self . query_tree = None ;
self . candidates = Candidates ::default ( ) ;
} else {
2021-03-04 16:07:07 +01:00
let mut new_candidates = if candidates . len ( ) < = 1000 {
if let Some ( cache ) = self . plane_sweep_cache . as_mut ( ) {
match cache . next ( ) {
Some ( ( p , candidates ) ) = > {
self . proximity = p ;
candidates
} ,
None = > {
// reset state to (None, Forbidden(_))
self . query_tree = None ;
self . candidates = Candidates ::default ( ) ;
continue
} ,
}
} else {
let cache = resolve_plane_sweep_candidates (
self . ctx ,
query_tree ,
2021-03-05 11:02:24 +01:00
candidates ,
wdcache ,
2021-03-04 16:07:07 +01:00
) ? ;
self . plane_sweep_cache = Some ( cache . into_iter ( ) ) ;
continue
}
} else { // use set theory based algorithm
resolve_candidates (
self . ctx ,
& query_tree ,
self . proximity ,
& mut self . candidates_cache ,
2021-03-05 11:02:24 +01:00
wdcache ,
2021-03-04 16:07:07 +01:00
) ?
} ;
2021-02-22 17:17:01 +01:00
new_candidates . intersect_with ( & candidates ) ;
candidates . difference_with ( & new_candidates ) ;
self . proximity + = 1 ;
let bucket_candidates = match self . parent {
2021-02-25 16:54:41 +01:00
Some ( _ ) = > take ( & mut self . bucket_candidates ) ,
None = > new_candidates . clone ( ) ,
2021-02-22 17:17:01 +01:00
} ;
return Ok ( Some ( CriterionResult {
query_tree : Some ( query_tree . clone ( ) ) ,
2021-03-09 12:04:52 +01:00
candidates : Some ( new_candidates ) ,
2021-02-22 17:17:01 +01:00
bucket_candidates ,
} ) ) ;
}
} ,
( Some ( ( max_prox , query_tree ) ) , Forbidden ( candidates ) ) = > {
if self . proximity as usize > * max_prox {
self . query_tree = None ;
self . candidates = Candidates ::default ( ) ;
} else {
let mut new_candidates = resolve_candidates (
self . ctx ,
& query_tree ,
self . proximity ,
& mut self . candidates_cache ,
2021-03-05 11:02:24 +01:00
wdcache ,
2021-02-22 17:17:01 +01:00
) ? ;
new_candidates . difference_with ( & candidates ) ;
candidates . union_with ( & new_candidates ) ;
self . proximity + = 1 ;
let bucket_candidates = match self . parent {
2021-02-25 16:54:41 +01:00
Some ( _ ) = > take ( & mut self . bucket_candidates ) ,
None = > new_candidates . clone ( ) ,
2021-02-22 17:17:01 +01:00
} ;
return Ok ( Some ( CriterionResult {
query_tree : Some ( query_tree . clone ( ) ) ,
2021-03-09 12:04:52 +01:00
candidates : Some ( new_candidates ) ,
2021-02-22 17:17:01 +01:00
bucket_candidates ,
} ) ) ;
}
} ,
( None , Allowed ( _ ) ) = > {
let candidates = take ( & mut self . candidates ) . into_inner ( ) ;
return Ok ( Some ( CriterionResult {
query_tree : None ,
2021-03-09 12:04:52 +01:00
candidates : Some ( candidates . clone ( ) ) ,
2021-02-25 16:54:41 +01:00
bucket_candidates : candidates ,
2021-02-22 17:17:01 +01:00
} ) ) ;
} ,
( None , Forbidden ( _ ) ) = > {
match self . parent . as_mut ( ) {
Some ( parent ) = > {
2021-03-05 11:02:24 +01:00
match parent . next ( wdcache ) ? {
2021-02-22 17:17:01 +01:00
Some ( CriterionResult { query_tree , candidates , bucket_candidates } ) = > {
2021-03-09 12:04:52 +01:00
let candidates = match ( & query_tree , candidates ) {
( _ , Some ( candidates ) ) = > candidates ,
( Some ( qt ) , None ) = > resolve_query_tree ( self . ctx , qt , & mut HashMap ::new ( ) , wdcache ) ? ,
( None , None ) = > RoaringBitmap ::new ( ) ,
} ;
2021-03-09 15:55:59 +01:00
if bucket_candidates . is_empty ( ) {
self . bucket_candidates . union_with ( & candidates ) ;
} else {
self . bucket_candidates . union_with ( & bucket_candidates ) ;
}
2021-02-22 17:17:01 +01:00
self . query_tree = query_tree . map ( | op | ( maximum_proximity ( & op ) , op ) ) ;
self . proximity = 0 ;
self . candidates = Candidates ::Allowed ( candidates ) ;
2021-03-04 16:07:07 +01:00
self . plane_sweep_cache = None ;
2021-02-22 17:17:01 +01:00
} ,
None = > return Ok ( None ) ,
}
} ,
None = > return Ok ( None ) ,
}
} ,
}
}
}
}
fn resolve_candidates < ' t > (
ctx : & ' t dyn Context ,
query_tree : & Operation ,
proximity : u8 ,
cache : & mut HashMap < ( Operation , u8 ) , Vec < ( Query , Query , RoaringBitmap ) > > ,
2021-03-05 11:02:24 +01:00
wdcache : & mut WordDerivationsCache ,
2021-02-22 17:17:01 +01:00
) -> anyhow ::Result < RoaringBitmap >
{
fn resolve_operation < ' t > (
ctx : & ' t dyn Context ,
query_tree : & Operation ,
proximity : u8 ,
cache : & mut HashMap < ( Operation , u8 ) , Vec < ( Query , Query , RoaringBitmap ) > > ,
2021-03-05 11:02:24 +01:00
wdcache : & mut WordDerivationsCache ,
2021-02-22 17:17:01 +01:00
) -> anyhow ::Result < Vec < ( Query , Query , RoaringBitmap ) > >
{
use Operation ::{ And , Consecutive , Or , Query } ;
let result = match query_tree {
2021-03-05 11:02:24 +01:00
And ( ops ) = > mdfs ( ctx , ops , proximity , cache , wdcache ) ? ,
2021-02-22 17:17:01 +01:00
Consecutive ( ops ) = > if proximity = = 0 {
2021-03-05 11:02:24 +01:00
mdfs ( ctx , ops , 0 , cache , wdcache ) ?
2021-02-22 17:17:01 +01:00
} else {
Default ::default ( )
} ,
Or ( _ , ops ) = > {
let mut output = Vec ::new ( ) ;
for op in ops {
2021-03-05 11:02:24 +01:00
let result = resolve_operation ( ctx , op , proximity , cache , wdcache ) ? ;
2021-02-22 17:17:01 +01:00
output . extend ( result ) ;
}
output
} ,
Query ( q ) = > if proximity = = 0 {
2021-03-05 11:02:24 +01:00
let candidates = query_docids ( ctx , q , wdcache ) ? ;
2021-02-22 17:17:01 +01:00
vec! [ ( q . clone ( ) , q . clone ( ) , candidates ) ]
} else {
Default ::default ( )
} ,
} ;
Ok ( result )
}
fn mdfs_pair < ' t > (
ctx : & ' t dyn Context ,
left : & Operation ,
right : & Operation ,
proximity : u8 ,
cache : & mut HashMap < ( Operation , u8 ) , Vec < ( Query , Query , RoaringBitmap ) > > ,
2021-03-05 11:02:24 +01:00
wdcache : & mut WordDerivationsCache ,
2021-02-22 17:17:01 +01:00
) -> anyhow ::Result < Vec < ( Query , Query , RoaringBitmap ) > >
{
2021-03-02 14:46:50 +01:00
fn pair_combinations ( mana : u8 , left_max : u8 ) -> impl Iterator < Item = ( u8 , u8 ) > {
( 0 ..= mana . min ( left_max ) ) . map ( move | m | ( m , mana - m ) )
2021-02-22 17:17:01 +01:00
}
2021-03-02 14:46:50 +01:00
let pair_max_proximity = 7 ;
2021-02-22 17:17:01 +01:00
let mut output = Vec ::new ( ) ;
2021-03-02 14:46:50 +01:00
for ( pair_p , left_right_p ) in pair_combinations ( proximity , pair_max_proximity ) {
for ( left_p , right_p ) in pair_combinations ( left_right_p , left_right_p ) {
2021-02-22 17:17:01 +01:00
let left_key = ( left . clone ( ) , left_p ) ;
if ! cache . contains_key ( & left_key ) {
2021-03-05 11:02:24 +01:00
let candidates = resolve_operation ( ctx , left , left_p , cache , wdcache ) ? ;
2021-02-22 17:17:01 +01:00
cache . insert ( left_key . clone ( ) , candidates ) ;
}
let right_key = ( right . clone ( ) , right_p ) ;
if ! cache . contains_key ( & right_key ) {
2021-03-05 11:02:24 +01:00
let candidates = resolve_operation ( ctx , right , right_p , cache , wdcache ) ? ;
2021-02-22 17:17:01 +01:00
cache . insert ( right_key . clone ( ) , candidates ) ;
}
let lefts = cache . get ( & left_key ) . unwrap ( ) ;
let rights = cache . get ( & right_key ) . unwrap ( ) ;
for ( ll , lr , lcandidates ) in lefts {
for ( rl , rr , rcandidates ) in rights {
2021-03-05 11:02:24 +01:00
let mut candidates = query_pair_proximity_docids ( ctx , lr , rl , pair_p + 1 , wdcache ) ? ;
2021-02-22 17:17:01 +01:00
if lcandidates . len ( ) < rcandidates . len ( ) {
candidates . intersect_with ( lcandidates ) ;
candidates . intersect_with ( rcandidates ) ;
} else {
candidates . intersect_with ( rcandidates ) ;
candidates . intersect_with ( lcandidates ) ;
}
if ! candidates . is_empty ( ) {
output . push ( ( ll . clone ( ) , rr . clone ( ) , candidates ) ) ;
}
}
}
}
}
Ok ( output )
}
fn mdfs < ' t > (
ctx : & ' t dyn Context ,
branches : & [ Operation ] ,
proximity : u8 ,
cache : & mut HashMap < ( Operation , u8 ) , Vec < ( Query , Query , RoaringBitmap ) > > ,
2021-03-05 11:02:24 +01:00
wdcache : & mut WordDerivationsCache ,
2021-02-22 17:17:01 +01:00
) -> anyhow ::Result < Vec < ( Query , Query , RoaringBitmap ) > >
{
// Extract the first two elements but gives the tail
// that is just after the first element.
let next = branches . split_first ( ) . map ( | ( h1 , t ) | {
( h1 , t . split_first ( ) . map ( | ( h2 , _ ) | ( h2 , t ) ) )
} ) ;
match next {
2021-03-05 11:02:24 +01:00
Some ( ( head1 , Some ( ( head2 , [ _ ] ) ) ) ) = > mdfs_pair ( ctx , head1 , head2 , proximity , cache , wdcache ) ,
2021-02-22 17:17:01 +01:00
Some ( ( head1 , Some ( ( head2 , tail ) ) ) ) = > {
let mut output = Vec ::new ( ) ;
for p in 0 ..= proximity {
2021-03-05 11:02:24 +01:00
for ( lhead , _ , head_candidates ) in mdfs_pair ( ctx , head1 , head2 , p , cache , wdcache ) ? {
2021-02-22 17:17:01 +01:00
if ! head_candidates . is_empty ( ) {
2021-03-05 11:02:24 +01:00
for ( _ , rtail , mut candidates ) in mdfs ( ctx , tail , proximity - p , cache , wdcache ) ? {
2021-02-22 17:17:01 +01:00
candidates . intersect_with ( & head_candidates ) ;
if ! candidates . is_empty ( ) {
output . push ( ( lhead . clone ( ) , rtail , candidates ) ) ;
}
}
}
}
}
Ok ( output )
} ,
2021-03-05 11:02:24 +01:00
Some ( ( head1 , None ) ) = > resolve_operation ( ctx , head1 , proximity , cache , wdcache ) ,
2021-02-22 17:17:01 +01:00
None = > return Ok ( Default ::default ( ) ) ,
}
}
let mut candidates = RoaringBitmap ::new ( ) ;
2021-03-05 11:02:24 +01:00
for ( _ , _ , cds ) in resolve_operation ( ctx , query_tree , proximity , cache , wdcache ) ? {
2021-02-22 17:17:01 +01:00
candidates . union_with ( & cds ) ;
}
Ok ( candidates )
}
2021-03-03 15:41:09 +01:00
2021-03-08 16:12:03 +01:00
fn resolve_plane_sweep_candidates (
ctx : & dyn Context ,
2021-03-03 15:41:09 +01:00
query_tree : & Operation ,
allowed_candidates : & RoaringBitmap ,
2021-03-05 11:02:24 +01:00
wdcache : & mut WordDerivationsCache ,
2021-03-03 15:41:09 +01:00
) -> anyhow ::Result < BTreeMap < u8 , RoaringBitmap > >
{
/// FIXME may be buggy with query like "new new york"
2021-03-08 16:12:03 +01:00
fn plane_sweep < ' a > (
ctx : & dyn Context ,
operations : & ' a [ Operation ] ,
2021-03-03 15:41:09 +01:00
docid : DocumentId ,
consecutive : bool ,
2021-03-08 16:12:03 +01:00
rocache : & mut HashMap < & ' a Operation , Vec < ( Position , u8 , Position ) > > ,
2021-03-09 17:48:05 +01:00
words_positions : & HashMap < String , RoaringBitmap > ,
2021-03-05 11:02:24 +01:00
wdcache : & mut WordDerivationsCache ,
) -> anyhow ::Result < Vec < ( Position , u8 , Position ) > >
{
fn compute_groups_proximity (
groups : & [ ( usize , ( Position , u8 , Position ) ) ] ,
consecutive : bool ,
) -> Option < ( Position , u8 , Position ) >
{
2021-03-03 15:41:09 +01:00
// take the inner proximity of the first group as initial
2021-03-08 16:27:52 +01:00
let ( _ , ( _ , mut proximity , _ ) ) = groups . first ( ) ? ;
let ( _ , ( left_most_pos , _ , _ ) ) = groups . first ( ) ? ;
let ( _ , ( _ , _ , right_most_pos ) ) = groups . last ( ) ? ;
2021-03-03 15:41:09 +01:00
for pair in groups . windows ( 2 ) {
if let [ ( i1 , ( _ , _ , rpos1 ) ) , ( i2 , ( lpos2 , prox2 , _ ) ) ] = pair {
// if a pair overlap, meaning that they share at least a word, we return None
if rpos1 > = lpos2 { return None }
// if groups are in the good order (query order) we remove 1 to the proximity
// the proximity is clamped to 7
let pair_proximity = if i1 < i2 {
( * lpos2 - * rpos1 - 1 ) . min ( 7 )
} else {
( * lpos2 - * rpos1 ) . min ( 7 )
} ;
proximity + = pair_proximity as u8 + prox2 ;
}
}
// if groups should be consecutives, we will only accept groups with a proximity of 0
if ! consecutive | | proximity = = 0 {
2021-03-08 16:27:52 +01:00
Some ( ( * left_most_pos , proximity , * right_most_pos ) )
2021-03-05 11:02:24 +01:00
} else {
None
}
2021-03-03 15:41:09 +01:00
}
let groups_len = operations . len ( ) ;
let mut groups_positions = Vec ::with_capacity ( groups_len ) ;
for operation in operations {
2021-03-09 17:48:05 +01:00
let positions = resolve_operation ( ctx , operation , docid , rocache , words_positions , wdcache ) ? ;
2021-03-03 15:41:09 +01:00
groups_positions . push ( positions . into_iter ( ) ) ;
}
// Pop top elements of each list.
let mut current = Vec ::with_capacity ( groups_len ) ;
for ( i , positions ) in groups_positions . iter_mut ( ) . enumerate ( ) {
match positions . next ( ) {
Some ( p ) = > current . push ( ( i , p ) ) ,
// if a group return None, it means that the document does not contain all the words,
// we return an empty result.
None = > return Ok ( Vec ::new ( ) ) ,
}
}
// Sort k elements by their positions.
current . sort_unstable_by_key ( | ( _ , p ) | * p ) ;
// Find leftmost and rightmost group and their positions.
let mut leftmost = * current . first ( ) . unwrap ( ) ;
let mut rightmost = * current . last ( ) . unwrap ( ) ;
let mut output = Vec ::new ( ) ;
loop {
// Find the position p of the next elements of a list of the leftmost group.
// If the list is empty, break the loop.
let p = groups_positions [ leftmost . 0 ] . next ( ) . map ( | p | ( leftmost . 0 , p ) ) ;
// let q be the position q of second group of the interval.
let q = current [ 1 ] ;
let mut leftmost_index = 0 ;
// If p > r, then the interval [l, r] is minimal and
// we insert it into the heap according to its size.
if p . map_or ( true , | p | p . 1 > rightmost . 1 ) {
leftmost_index = current [ 0 ] . 0 ;
if let Some ( group ) = compute_groups_proximity ( & current , consecutive ) {
output . push ( group ) ;
}
}
// TODO not sure about breaking here or when the p list is found empty.
let p = match p {
Some ( p ) = > p ,
None = > break ,
} ;
// Remove the leftmost group P in the interval,
// and pop the same group from a list.
current [ leftmost_index ] = p ;
if p . 1 > rightmost . 1 {
// if [l, r] is minimal, let r = p and l = q.
rightmost = p ;
leftmost = q ;
} else {
// Ohterwise, let l = min{p,q}.
leftmost = if p . 1 < q . 1 { p } else { q } ;
}
// Then update the interval and order of groups_positions in the interval.
current . sort_unstable_by_key ( | ( _ , p ) | * p ) ;
}
// Sort the list according to the size and the positions.
output . sort_unstable ( ) ;
Ok ( output )
}
2021-03-08 16:12:03 +01:00
fn resolve_operation < ' a > (
ctx : & dyn Context ,
query_tree : & ' a Operation ,
2021-03-03 15:41:09 +01:00
docid : DocumentId ,
2021-03-08 16:12:03 +01:00
rocache : & mut HashMap < & ' a Operation , Vec < ( Position , u8 , Position ) > > ,
2021-03-09 17:48:05 +01:00
words_positions : & HashMap < String , RoaringBitmap > ,
2021-03-05 11:02:24 +01:00
wdcache : & mut WordDerivationsCache ,
2021-03-08 16:12:03 +01:00
) -> anyhow ::Result < Vec < ( Position , u8 , Position ) > >
{
2021-03-03 15:41:09 +01:00
use Operation ::{ And , Consecutive , Or } ;
2021-03-08 16:12:03 +01:00
if let Some ( result ) = rocache . get ( query_tree ) {
return Ok ( result . clone ( ) ) ;
}
let result = match query_tree {
2021-03-09 17:48:05 +01:00
And ( ops ) = > plane_sweep ( ctx , ops , docid , false , rocache , words_positions , wdcache ) ? ,
Consecutive ( ops ) = > plane_sweep ( ctx , ops , docid , true , rocache , words_positions , wdcache ) ? ,
2021-03-03 15:41:09 +01:00
Or ( _ , ops ) = > {
let mut result = Vec ::new ( ) ;
for op in ops {
2021-03-09 17:48:05 +01:00
result . extend ( resolve_operation ( ctx , op , docid , rocache , words_positions , wdcache ) ? )
2021-03-03 15:41:09 +01:00
}
result . sort_unstable ( ) ;
2021-03-08 16:12:03 +01:00
result
2021-03-03 15:41:09 +01:00
} ,
2021-03-09 17:48:05 +01:00
Operation ::Query ( Query { prefix , kind } ) = > {
let mut result = Vec ::new ( ) ;
match kind {
2021-03-03 15:41:09 +01:00
QueryKind ::Exact { word , .. } = > {
if * prefix {
2021-03-09 17:48:05 +01:00
let iter = word_derivations ( word , true , 0 , & words_positions )
. flat_map ( | positions | positions . iter ( ) . map ( | p | ( p , 0 , p ) ) ) ;
result . extend ( iter ) ;
2021-03-03 15:41:09 +01:00
} else {
2021-03-09 17:48:05 +01:00
if let Some ( positions ) = words_positions . get ( word ) {
result . extend ( positions . iter ( ) . map ( | p | ( p , 0 , p ) ) ) ;
}
2021-03-03 15:41:09 +01:00
}
} ,
QueryKind ::Tolerant { typo , word } = > {
2021-03-09 17:48:05 +01:00
let iter = word_derivations ( word , * prefix , * typo , & words_positions )
. flat_map ( | positions | positions . iter ( ) . map ( | p | ( p , 0 , p ) ) ) ;
2021-03-03 15:41:09 +01:00
result . extend ( iter ) ;
}
}
result . sort_unstable ( ) ;
2021-03-08 16:12:03 +01:00
result
2021-03-03 15:41:09 +01:00
}
2021-03-08 16:12:03 +01:00
} ;
rocache . insert ( query_tree , result . clone ( ) ) ;
Ok ( result )
2021-03-03 15:41:09 +01:00
}
2021-03-09 17:48:05 +01:00
fn word_derivations < ' a > (
word : & str ,
is_prefix : bool ,
max_typo : u8 ,
words_positions : & ' a HashMap < String , RoaringBitmap > ,
) -> impl Iterator < Item = & ' a RoaringBitmap >
{
let dfa = build_dfa ( word , max_typo , is_prefix ) ;
words_positions . iter ( ) . filter_map ( move | ( document_word , positions ) | {
use levenshtein_automata ::Distance ;
match dfa . eval ( document_word ) {
Distance ::Exact ( _ ) = > Some ( positions ) ,
Distance ::AtLeast ( _ ) = > None ,
}
} )
}
2021-03-08 16:12:03 +01:00
let mut resolve_operation_cache = HashMap ::new ( ) ;
2021-03-03 15:41:09 +01:00
let mut candidates = BTreeMap ::new ( ) ;
for docid in allowed_candidates {
2021-03-09 17:48:05 +01:00
let words_positions = ctx . docid_words_positions ( docid ) ? ;
2021-03-08 16:12:03 +01:00
resolve_operation_cache . clear ( ) ;
let positions = resolve_operation (
ctx ,
query_tree ,
docid ,
& mut resolve_operation_cache ,
2021-03-09 17:48:05 +01:00
& words_positions ,
2021-03-08 16:12:03 +01:00
wdcache ,
) ? ;
2021-03-03 15:41:09 +01:00
let best_proximity = positions . into_iter ( ) . min_by_key ( | ( _ , proximity , _ ) | * proximity ) ;
let best_proximity = best_proximity . map ( | ( _ , proximity , _ ) | proximity ) . unwrap_or ( 7 ) ;
candidates . entry ( best_proximity ) . or_insert_with ( RoaringBitmap ::new ) . insert ( docid ) ;
}
Ok ( candidates )
}