2021-06-16 18:33:33 +02:00
use std ::cmp ::{ self , Ordering } ;
2021-04-13 15:06:12 +02:00
use std ::collections ::binary_heap ::PeekMut ;
2021-06-16 18:33:33 +02:00
use std ::collections ::{ btree_map , BTreeMap , BinaryHeap , HashMap } ;
2021-10-04 16:36:11 +02:00
use std ::iter ::Peekable ;
2021-03-11 17:31:02 +01:00
use std ::mem ::take ;
2021-03-11 11:48:55 +01:00
use roaring ::RoaringBitmap ;
2021-06-16 18:33:33 +02:00
use super ::{ resolve_query_tree , Context , Criterion , CriterionParameters , CriterionResult } ;
2021-03-11 11:48:55 +01:00
use crate ::search ::criteria ::Query ;
2021-03-11 17:31:02 +01:00
use crate ::search ::query_tree ::{ Operation , QueryKind } ;
2021-06-16 18:33:33 +02:00
use crate ::search ::{ build_dfa , word_derivations , WordDerivationsCache } ;
2021-10-05 11:18:42 +02:00
use crate ::Result ;
2021-03-11 11:48:55 +01:00
2021-04-13 18:25:38 +02:00
/// Multiplier applied to a rank before it is divided by the number of words in the query.
///
/// It is the least common multiple of every integer between 1 and 10
/// (https://en.wikipedia.org/wiki/Least_common_multiple), which means that a value
/// scaled by it can be divided by any of those integers without losing precision.
const LCM_10_FIRST_NUMBERS: u32 = 2_520;
2021-04-15 10:44:27 +02:00
2021-04-28 18:01:23 +02:00
/// Number of remaining candidates that decides which algorithm is used:
/// strictly below it the linear buckets are used, otherwise the set buckets.
const CANDIDATES_THRESHOLD: u64 = 500;
2021-04-28 18:01:23 +02:00
2021-05-05 20:46:56 +02:00
/// A query tree flattened into a list of branches; each branch is a list of word slots,
/// and each slot holds the alternative queries accepted for that slot.
type FlattenedQueryTree = Vec<Vec<Vec<Query>>>;
2021-05-10 12:33:37 +02:00
2021-03-11 11:48:55 +01:00
pub struct Attribute < ' t > {
2021-03-31 19:23:02 +02:00
ctx : & ' t dyn Context < ' t > ,
2021-05-05 20:46:56 +02:00
state : Option < ( Operation , FlattenedQueryTree , RoaringBitmap ) > ,
2021-03-11 11:48:55 +01:00
bucket_candidates : RoaringBitmap ,
2021-03-23 15:25:46 +01:00
parent : Box < dyn Criterion + ' t > ,
2021-10-04 16:36:11 +02:00
linear_buckets : Option < btree_map ::IntoIter < u64 , RoaringBitmap > > ,
set_buckets : Option < BinaryHeap < Branch < ' t > > > ,
2021-03-11 11:48:55 +01:00
}
impl < ' t > Attribute < ' t > {
2021-03-31 19:23:02 +02:00
pub fn new ( ctx : & ' t dyn Context < ' t > , parent : Box < dyn Criterion + ' t > ) -> Self {
2021-03-11 11:48:55 +01:00
Attribute {
ctx ,
2021-05-05 20:46:56 +02:00
state : None ,
2021-03-11 11:48:55 +01:00
bucket_candidates : RoaringBitmap ::new ( ) ,
2021-03-23 15:25:46 +01:00
parent ,
2021-10-04 16:36:11 +02:00
linear_buckets : None ,
set_buckets : None ,
2021-03-11 11:48:55 +01:00
}
}
}
impl < ' t > Criterion for Attribute < ' t > {
#[ logging_timer::time( " Attribute::{} " ) ]
2021-06-14 16:46:19 +02:00
fn next ( & mut self , params : & mut CriterionParameters ) -> Result < Option < CriterionResult > > {
2021-04-28 18:01:23 +02:00
// remove excluded candidates when next is called, instead of doing it in the loop.
2021-05-05 20:46:56 +02:00
if let Some ( ( _ , _ , allowed_candidates ) ) = self . state . as_mut ( ) {
* allowed_candidates - = params . excluded_candidates ;
2021-04-28 18:01:23 +02:00
}
2021-03-11 17:31:02 +01:00
loop {
2021-05-05 20:46:56 +02:00
match self . state . take ( ) {
Some ( ( query_tree , _ , allowed_candidates ) ) if allowed_candidates . is_empty ( ) = > {
2021-03-11 17:31:02 +01:00
return Ok ( Some ( CriterionResult {
2021-05-05 20:46:56 +02:00
query_tree : Some ( query_tree ) ,
candidates : Some ( RoaringBitmap ::new ( ) ) ,
2021-05-10 12:33:37 +02:00
filtered_candidates : None ,
2021-05-05 20:46:56 +02:00
bucket_candidates : Some ( take ( & mut self . bucket_candidates ) ) ,
2021-03-11 17:31:02 +01:00
} ) ) ;
2021-06-16 18:33:33 +02:00
}
2021-05-05 20:46:56 +02:00
Some ( ( query_tree , flattened_query_tree , mut allowed_candidates ) ) = > {
let found_candidates = if allowed_candidates . len ( ) < CANDIDATES_THRESHOLD {
2021-10-04 16:36:11 +02:00
let linear_buckets = match self . linear_buckets . as_mut ( ) {
Some ( linear_buckets ) = > linear_buckets ,
2021-03-31 19:23:02 +02:00
None = > {
2021-10-04 16:36:11 +02:00
let new_buckets = initialize_linear_buckets (
2021-06-16 18:33:33 +02:00
self . ctx ,
& flattened_query_tree ,
& allowed_candidates ,
) ? ;
2021-10-04 16:36:11 +02:00
self . linear_buckets . get_or_insert ( new_buckets . into_iter ( ) )
2021-06-16 18:33:33 +02:00
}
2021-03-31 19:23:02 +02:00
} ;
2021-03-11 17:31:02 +01:00
2021-10-04 16:36:11 +02:00
match linear_buckets . next ( ) {
2021-03-31 19:23:02 +02:00
Some ( ( _score , candidates ) ) = > candidates ,
None = > {
return Ok ( Some ( CriterionResult {
2021-05-05 20:46:56 +02:00
query_tree : Some ( query_tree ) ,
candidates : Some ( RoaringBitmap ::new ( ) ) ,
2021-05-10 12:33:37 +02:00
filtered_candidates : None ,
2021-05-05 20:46:56 +02:00
bucket_candidates : Some ( take ( & mut self . bucket_candidates ) ) ,
2021-03-31 19:23:02 +02:00
} ) ) ;
2021-06-16 18:33:33 +02:00
}
2021-03-31 19:23:02 +02:00
}
} else {
2021-10-04 16:36:11 +02:00
let mut set_buckets = match self . set_buckets . as_mut ( ) {
Some ( set_buckets ) = > set_buckets ,
None = > {
let new_buckets = initialize_set_buckets (
self . ctx ,
& flattened_query_tree ,
& allowed_candidates ,
params . wdcache ,
) ? ;
self . set_buckets . get_or_insert ( new_buckets )
}
} ;
match set_compute_candidates ( & mut set_buckets , & allowed_candidates ) ? {
Some ( ( _score , candidates ) ) = > candidates ,
2021-04-06 15:03:41 +02:00
None = > {
return Ok ( Some ( CriterionResult {
2021-05-05 20:46:56 +02:00
query_tree : Some ( query_tree ) ,
candidates : Some ( RoaringBitmap ::new ( ) ) ,
2021-05-10 12:33:37 +02:00
filtered_candidates : None ,
2021-05-05 20:46:56 +02:00
bucket_candidates : Some ( take ( & mut self . bucket_candidates ) ) ,
2021-04-06 15:03:41 +02:00
} ) ) ;
2021-06-16 18:33:33 +02:00
}
2021-04-06 15:03:41 +02:00
}
2021-03-11 17:31:02 +01:00
} ;
2021-05-05 20:46:56 +02:00
allowed_candidates - = & found_candidates ;
2021-06-16 18:33:33 +02:00
self . state =
Some ( ( query_tree . clone ( ) , flattened_query_tree , allowed_candidates ) ) ;
2021-03-23 15:25:46 +01:00
2021-03-11 17:31:02 +01:00
return Ok ( Some ( CriterionResult {
2021-05-05 20:46:56 +02:00
query_tree : Some ( query_tree ) ,
2021-03-11 17:31:02 +01:00
candidates : Some ( found_candidates ) ,
2021-05-10 12:33:37 +02:00
filtered_candidates : None ,
2021-05-05 20:46:56 +02:00
bucket_candidates : Some ( take ( & mut self . bucket_candidates ) ) ,
2021-03-11 17:31:02 +01:00
} ) ) ;
2021-06-16 18:33:33 +02:00
}
None = > match self . parent . next ( params ) ? {
Some ( CriterionResult {
query_tree : Some ( query_tree ) ,
candidates ,
filtered_candidates ,
bucket_candidates ,
} ) = > {
let mut candidates = match candidates {
Some ( candidates ) = > candidates ,
None = > {
resolve_query_tree ( self . ctx , & query_tree , params . wdcache ) ?
- params . excluded_candidates
2021-05-10 12:33:37 +02:00
}
2021-06-16 18:33:33 +02:00
} ;
2021-05-10 12:33:37 +02:00
2021-06-16 18:33:33 +02:00
if let Some ( filtered_candidates ) = filtered_candidates {
candidates & = filtered_candidates ;
}
2021-05-05 20:46:56 +02:00
2021-06-16 18:33:33 +02:00
let flattened_query_tree = flatten_query_tree ( & query_tree ) ;
2021-05-05 20:46:56 +02:00
2021-06-16 18:33:33 +02:00
match bucket_candidates {
Some ( bucket_candidates ) = > self . bucket_candidates | = bucket_candidates ,
None = > self . bucket_candidates | = & candidates ,
}
self . state = Some ( ( query_tree , flattened_query_tree , candidates ) ) ;
2021-10-04 16:36:11 +02:00
self . linear_buckets = None ;
2021-06-16 18:33:33 +02:00
}
Some ( CriterionResult {
query_tree : None ,
candidates ,
filtered_candidates ,
bucket_candidates ,
} ) = > {
return Ok ( Some ( CriterionResult {
query_tree : None ,
candidates ,
filtered_candidates ,
bucket_candidates ,
} ) ) ;
2021-03-11 17:31:02 +01:00
}
2021-06-16 18:33:33 +02:00
None = > return Ok ( None ) ,
2021-03-11 17:31:02 +01:00
} ,
}
}
2021-03-11 11:48:55 +01:00
}
}
2021-10-05 11:18:42 +02:00
/// QueryPositionIterator is an Iterator over positions of a Query,
/// It contains iterators over words positions.
struct QueryPositionIterator < ' t > {
inner :
Vec < Peekable < Box < dyn Iterator < Item = heed ::Result < ( ( & ' t str , u32 ) , RoaringBitmap ) > > + ' t > > > ,
2021-03-31 19:23:02 +02:00
}
2021-10-05 11:18:42 +02:00
impl < ' t > QueryPositionIterator < ' t > {
2021-06-14 16:46:19 +02:00
fn new (
ctx : & ' t dyn Context < ' t > ,
2021-10-04 16:36:11 +02:00
queries : & [ Query ] ,
2021-06-14 16:46:19 +02:00
wdcache : & mut WordDerivationsCache ,
2021-10-04 16:36:11 +02:00
) -> Result < Self > {
2021-03-31 19:23:02 +02:00
let mut inner = Vec ::with_capacity ( queries . len ( ) ) ;
for query in queries {
2021-10-04 16:36:11 +02:00
let in_prefix_cache = query . prefix & & ctx . in_prefix_cache ( query . kind . word ( ) ) ;
2021-04-01 14:42:23 +02:00
match & query . kind {
QueryKind ::Exact { word , .. } = > {
2021-10-04 16:36:11 +02:00
if ! query . prefix | | in_prefix_cache {
2021-10-06 11:12:26 +02:00
let word = query . kind . word ( ) ;
let iter = ctx . word_position_iterator ( word , in_prefix_cache ) ? ;
2021-10-04 16:36:11 +02:00
inner . push ( iter . peekable ( ) ) ;
2021-04-01 14:42:23 +02:00
} else {
2021-06-16 18:33:33 +02:00
for ( word , _ ) in word_derivations ( & word , true , 0 , ctx . words_fst ( ) , wdcache ) ?
{
2021-10-05 11:18:42 +02:00
let iter = ctx . word_position_iterator ( & word , in_prefix_cache ) ? ;
2021-10-04 16:36:11 +02:00
inner . push ( iter . peekable ( ) ) ;
2021-04-01 14:42:23 +02:00
}
}
2021-06-16 18:33:33 +02:00
}
2021-04-01 14:42:23 +02:00
QueryKind ::Tolerant { typo , word } = > {
2021-06-16 18:33:33 +02:00
for ( word , _ ) in
word_derivations ( & word , query . prefix , * typo , ctx . words_fst ( ) , wdcache ) ?
{
2021-10-05 11:18:42 +02:00
let iter = ctx . word_position_iterator ( & word , in_prefix_cache ) ? ;
2021-10-04 16:36:11 +02:00
inner . push ( iter . peekable ( ) ) ;
2021-04-01 14:42:23 +02:00
}
}
2021-10-04 16:36:11 +02:00
} ;
2021-03-31 19:23:02 +02:00
}
2021-10-04 16:36:11 +02:00
Ok ( Self { inner } )
2021-03-31 19:23:02 +02:00
}
2021-10-04 16:36:11 +02:00
}
2021-03-31 19:23:02 +02:00
2021-10-05 11:18:42 +02:00
impl < ' t > Iterator for QueryPositionIterator < ' t > {
2021-10-04 16:36:11 +02:00
type Item = heed ::Result < ( u32 , RoaringBitmap ) > ;
fn next ( & mut self ) -> Option < Self ::Item > {
2021-10-05 17:35:07 +02:00
// sort inner words from the closest next position to the farthest next position.
2021-10-04 16:36:11 +02:00
let expected_pos = self
. inner
. iter_mut ( )
. filter_map ( | wli | match wli . peek ( ) {
2021-10-05 11:18:42 +02:00
Some ( Ok ( ( ( _ , pos ) , _ ) ) ) = > Some ( * pos ) ,
2021-10-04 16:36:11 +02:00
_ = > None ,
} )
. min ( ) ? ;
let mut candidates = None ;
2021-03-31 19:23:02 +02:00
for wli in self . inner . iter_mut ( ) {
2021-10-05 11:18:42 +02:00
if let Some ( Ok ( ( ( _ , pos ) , _ ) ) ) = wli . peek ( ) {
2021-10-04 16:36:11 +02:00
if * pos > expected_pos {
continue ;
2021-03-31 19:23:02 +02:00
}
}
2021-10-04 16:36:11 +02:00
match wli . next ( ) {
Some ( Ok ( ( _ , docids ) ) ) = > {
candidates = match candidates . take ( ) {
Some ( candidates ) = > Some ( candidates | docids ) ,
None = > Some ( docids ) ,
2021-06-16 18:33:33 +02:00
}
2021-04-15 12:22:44 +02:00
}
2021-10-04 16:36:11 +02:00
Some ( Err ( e ) ) = > return Some ( Err ( e ) ) ,
None = > continue ,
2021-03-31 19:23:02 +02:00
}
}
2021-10-04 16:36:11 +02:00
candidates . map ( | candidates | Ok ( ( expected_pos , candidates ) ) )
}
2021-04-28 13:53:27 +02:00
}
/// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
2021-10-05 11:18:42 +02:00
/// This branch allows us to iterate over meta-interval of positions.
2021-10-04 16:36:11 +02:00
struct Branch < ' t > {
2021-10-05 11:18:42 +02:00
query_level_iterator : Vec < ( u32 , RoaringBitmap , Peekable < QueryPositionIterator < ' t > > ) > ,
2021-10-04 16:36:11 +02:00
last_result : ( u32 , RoaringBitmap ) ,
2021-03-31 19:23:02 +02:00
branch_size : u32 ,
}
2021-10-04 16:36:11 +02:00
impl < ' t > Branch < ' t > {
fn new (
ctx : & ' t dyn Context < ' t > ,
flatten_branch : & [ Vec < Query > ] ,
wdcache : & mut WordDerivationsCache ,
allowed_candidates : & RoaringBitmap ,
) -> Result < Self > {
let mut query_level_iterator = Vec ::new ( ) ;
for queries in flatten_branch {
2021-10-05 11:18:42 +02:00
let mut qli = QueryPositionIterator ::new ( ctx , queries , wdcache ) ? . peekable ( ) ;
2021-10-04 16:36:11 +02:00
let ( pos , docids ) = qli . next ( ) . transpose ( ) ? . unwrap_or ( ( 0 , RoaringBitmap ::new ( ) ) ) ;
query_level_iterator . push ( ( pos , docids & allowed_candidates , qli ) ) ;
}
let mut branch = Self {
query_level_iterator ,
last_result : ( 0 , RoaringBitmap ::new ( ) ) ,
branch_size : flatten_branch . len ( ) as u32 ,
} ;
branch . update_last_result ( ) ;
Ok ( branch )
}
2021-04-28 13:53:27 +02:00
/// return the next meta-interval of the branch,
/// and update inner interval in order to be ranked by the BinaryHeap.
2021-04-15 12:22:44 +02:00
fn next ( & mut self , allowed_candidates : & RoaringBitmap ) -> heed ::Result < bool > {
2021-10-04 16:36:11 +02:00
// update the first query.
let index = self . lowest_iterator_index ( ) ;
match self . query_level_iterator . get_mut ( index ) {
Some ( ( cur_pos , cur_docids , qli ) ) = > match qli . next ( ) . transpose ( ) ? {
Some ( ( next_pos , next_docids ) ) = > {
* cur_pos = next_pos ;
* cur_docids | = next_docids & allowed_candidates ;
2021-10-06 11:12:26 +02:00
self . update_last_result ( ) ;
Ok ( true )
2021-10-04 16:36:11 +02:00
}
2021-10-06 11:12:26 +02:00
None = > Ok ( false ) ,
2021-10-04 16:36:11 +02:00
} ,
2021-10-06 11:12:26 +02:00
None = > Ok ( false ) ,
2021-04-13 15:06:12 +02:00
}
}
2021-10-04 16:36:11 +02:00
fn lowest_iterator_index ( & mut self ) -> usize {
let ( index , _ ) = self
. query_level_iterator
. iter_mut ( )
. map ( | ( pos , docids , qli ) | {
if docids . is_empty ( ) {
0
} else {
2021-10-05 17:35:07 +02:00
match qli . peek ( ) {
Some ( result ) = > {
2021-10-04 16:36:11 +02:00
result . as_ref ( ) . map ( | ( next_pos , _ ) | * next_pos - * pos ) . unwrap_or ( 0 )
2021-10-05 17:35:07 +02:00
}
None = > u32 ::MAX ,
}
2021-10-04 16:36:11 +02:00
}
} )
. enumerate ( )
. min_by_key ( | ( _ , diff ) | * diff )
. unwrap_or ( ( 0 , 0 ) ) ;
index
2021-04-15 12:22:44 +02:00
}
2021-10-04 16:36:11 +02:00
fn update_last_result ( & mut self ) {
let mut result_pos = 0 ;
let mut result_docids = None ;
2021-04-15 12:22:44 +02:00
2021-10-04 16:36:11 +02:00
for ( pos , docids , _qli ) in self . query_level_iterator . iter ( ) {
result_pos + = pos ;
result_docids = result_docids
. take ( )
. map_or_else ( | | Some ( docids . clone ( ) ) , | candidates | Some ( candidates & docids ) ) ;
}
// remove last result docids from inner iterators
if let Some ( docids ) = result_docids . as_ref ( ) {
for ( _ , query_docids , _ ) in self . query_level_iterator . iter_mut ( ) {
* query_docids - = docids ;
}
}
self . last_result = ( result_pos , result_docids . unwrap_or_default ( ) ) ;
2021-04-15 12:22:44 +02:00
}
2021-04-28 13:53:27 +02:00
/// return the score of the current inner interval.
2021-04-13 15:06:12 +02:00
fn compute_rank ( & self ) -> u32 {
2021-10-04 16:36:11 +02:00
// we compute a rank from the position.
let ( pos , _ ) = self . last_result ;
pos . saturating_sub ( ( 0 .. self . branch_size ) . sum ( ) ) * LCM_10_FIRST_NUMBERS / self . branch_size
2021-04-13 15:06:12 +02:00
}
fn cmp ( & self , other : & Self ) -> Ordering {
let self_rank = self . compute_rank ( ) ;
let other_rank = other . compute_rank ( ) ;
2021-04-12 11:19:25 +02:00
2021-10-04 16:36:11 +02:00
// lower rank is better, and because BinaryHeap give the higher ranked branch, we reverse it.
self_rank . cmp ( & other_rank ) . reverse ( )
2021-03-31 19:23:02 +02:00
}
}
2021-10-04 16:36:11 +02:00
impl < ' t > Ord for Branch < ' t > {
2021-03-31 19:23:02 +02:00
fn cmp ( & self , other : & Self ) -> Ordering {
self . cmp ( other )
}
}
2021-10-04 16:36:11 +02:00
impl < ' t > PartialOrd for Branch < ' t > {
2021-03-31 19:23:02 +02:00
fn partial_cmp ( & self , other : & Self ) -> Option < Ordering > {
Some ( self . cmp ( other ) )
}
}
2021-10-04 16:36:11 +02:00
impl < ' t > PartialEq for Branch < ' t > {
2021-03-31 19:23:02 +02:00
fn eq ( & self , other : & Self ) -> bool {
self . cmp ( other ) = = Ordering ::Equal
}
}
2021-10-04 16:36:11 +02:00
impl < ' t > Eq for Branch < ' t > { }
2021-03-31 19:23:02 +02:00
2021-10-04 16:36:11 +02:00
fn initialize_set_buckets < ' t > (
2021-03-31 19:23:02 +02:00
ctx : & ' t dyn Context < ' t > ,
2021-10-04 16:36:11 +02:00
branches : & FlattenedQueryTree ,
2021-04-15 12:22:44 +02:00
allowed_candidates : & RoaringBitmap ,
2021-04-01 14:42:23 +02:00
wdcache : & mut WordDerivationsCache ,
2021-10-04 16:36:11 +02:00
) -> Result < BinaryHeap < Branch < ' t > > > {
let mut heap = BinaryHeap ::new ( ) ;
for flatten_branch in branches {
let branch = Branch ::new ( ctx , flatten_branch , wdcache , allowed_candidates ) ? ;
heap . push ( branch ) ;
2021-03-31 19:23:02 +02:00
}
2021-10-04 16:36:11 +02:00
Ok ( heap )
2021-03-31 19:23:02 +02:00
}
2021-10-04 16:36:11 +02:00
fn set_compute_candidates (
branches_heap : & mut BinaryHeap < Branch > ,
2021-03-31 19:23:02 +02:00
allowed_candidates : & RoaringBitmap ,
2021-10-04 16:36:11 +02:00
) -> Result < Option < ( u32 , RoaringBitmap ) > > {
2021-04-13 15:06:12 +02:00
let mut final_candidates : Option < ( u32 , RoaringBitmap ) > = None ;
2021-04-15 12:22:44 +02:00
let mut allowed_candidates = allowed_candidates . clone ( ) ;
2021-03-31 19:23:02 +02:00
while let Some ( mut branch ) = branches_heap . peek_mut ( ) {
2021-04-15 12:22:44 +02:00
// if current is worst than best we break to return
// candidates that correspond to the best rank
2021-10-04 16:36:11 +02:00
let branch_rank = branch . compute_rank ( ) ;
2021-04-27 17:39:23 +02:00
if let Some ( ( best_rank , _ ) ) = final_candidates {
2021-06-16 18:33:33 +02:00
if branch_rank > best_rank {
break ;
}
2021-04-27 17:39:23 +02:00
}
2021-10-04 16:36:11 +02:00
let candidates = take ( & mut branch . last_result . 1 ) ;
2021-04-12 11:19:25 +02:00
if candidates . is_empty ( ) {
// we don't have candidates, get next interval.
2021-06-16 18:33:33 +02:00
if ! branch . next ( & allowed_candidates ) ? {
PeekMut ::pop ( branch ) ;
}
2021-10-04 16:36:11 +02:00
} else {
2021-04-15 12:22:44 +02:00
allowed_candidates - = & candidates ;
2021-04-13 15:06:12 +02:00
final_candidates = match final_candidates . take ( ) {
2021-04-15 12:22:44 +02:00
// we add current candidates to best candidates
2021-04-13 15:06:12 +02:00
Some ( ( best_rank , mut best_candidates ) ) = > {
2021-04-15 12:22:44 +02:00
best_candidates | = candidates ;
2021-10-04 16:36:11 +02:00
branch . next ( & allowed_candidates ) ? ;
2021-04-15 12:22:44 +02:00
Some ( ( best_rank , best_candidates ) )
2021-06-16 18:33:33 +02:00
}
2021-04-13 15:06:12 +02:00
// we take current candidates as best candidates
None = > {
2021-10-04 16:36:11 +02:00
branch . next ( & allowed_candidates ) ? ;
2021-04-13 15:06:12 +02:00
Some ( ( branch_rank , candidates ) )
2021-06-16 18:33:33 +02:00
}
2021-04-13 15:06:12 +02:00
} ;
2021-03-31 19:23:02 +02:00
}
}
2021-10-04 16:36:11 +02:00
Ok ( final_candidates )
2021-03-31 19:23:02 +02:00
}
2021-10-04 16:36:11 +02:00
fn initialize_linear_buckets (
2021-03-11 17:31:02 +01:00
ctx : & dyn Context ,
2021-05-05 20:46:56 +02:00
branches : & FlattenedQueryTree ,
2021-03-11 17:31:02 +01:00
allowed_candidates : & RoaringBitmap ,
2021-06-16 18:33:33 +02:00
) -> Result < BTreeMap < u64 , RoaringBitmap > > {
fn compute_candidate_rank (
branches : & FlattenedQueryTree ,
words_positions : HashMap < String , RoaringBitmap > ,
) -> u64 {
2021-03-11 17:31:02 +01:00
let mut min_rank = u64 ::max_value ( ) ;
for branch in branches {
2021-03-24 18:20:13 +01:00
let branch_len = branch . len ( ) ;
let mut branch_rank = Vec ::with_capacity ( branch_len ) ;
2021-03-29 16:25:14 +02:00
for derivates in branch {
let mut position = None ;
for Query { prefix , kind } in derivates {
// find the best position of the current word in the document.
let current_position = match kind {
QueryKind ::Exact { word , .. } = > {
if * prefix {
word_derivations ( word , true , 0 , & words_positions )
2021-06-16 18:33:33 +02:00
. flat_map ( | positions | positions . iter ( ) . next ( ) )
. min ( )
2021-03-29 16:25:14 +02:00
} else {
2021-06-16 18:33:33 +02:00
words_positions
. get ( word )
2021-03-29 16:25:14 +02:00
. map ( | positions | positions . iter ( ) . next ( ) )
. flatten ( )
}
2021-06-16 18:33:33 +02:00
}
2021-03-29 16:25:14 +02:00
QueryKind ::Tolerant { typo , word } = > {
word_derivations ( word , * prefix , * typo , & words_positions )
2021-06-16 18:33:33 +02:00
. flat_map ( | positions | positions . iter ( ) . next ( ) )
. min ( )
}
2021-03-29 16:25:14 +02:00
} ;
match ( position , current_position ) {
( Some ( p ) , Some ( cp ) ) = > position = Some ( cmp ::min ( p , cp ) ) ,
( None , Some ( cp ) ) = > position = Some ( cp ) ,
_ = > ( ) ,
}
}
2021-03-11 17:31:02 +01:00
// if a position is found, we add it to the branch score,
// otherwise the branch is considered as unfindable in this document and we break.
if let Some ( position ) = position {
2021-03-24 18:20:13 +01:00
branch_rank . push ( position as u64 ) ;
2021-03-11 17:31:02 +01:00
} else {
2021-03-24 18:20:13 +01:00
branch_rank . clear ( ) ;
2021-03-11 17:31:02 +01:00
break ;
}
}
2021-03-24 18:20:13 +01:00
if ! branch_rank . is_empty ( ) {
branch_rank . sort_unstable ( ) ;
// because several words in same query can't match all a the position 0,
// we substract the word index to the position.
2021-06-16 18:33:33 +02:00
let branch_rank : u64 =
branch_rank . into_iter ( ) . enumerate ( ) . map ( | ( i , r ) | r - i as u64 ) . sum ( ) ;
2021-03-24 18:20:13 +01:00
// here we do the means of the words of the branch
2021-06-16 18:33:33 +02:00
min_rank =
min_rank . min ( branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64 ) ;
2021-03-24 18:20:13 +01:00
}
2021-03-11 17:31:02 +01:00
}
min_rank
}
fn word_derivations < ' a > (
word : & str ,
is_prefix : bool ,
max_typo : u8 ,
words_positions : & ' a HashMap < String , RoaringBitmap > ,
2021-06-16 18:33:33 +02:00
) -> impl Iterator < Item = & ' a RoaringBitmap > {
2021-03-11 17:31:02 +01:00
let dfa = build_dfa ( word , max_typo , is_prefix ) ;
words_positions . iter ( ) . filter_map ( move | ( document_word , positions ) | {
use levenshtein_automata ::Distance ;
match dfa . eval ( document_word ) {
Distance ::Exact ( _ ) = > Some ( positions ) ,
Distance ::AtLeast ( _ ) = > None ,
}
} )
}
let mut candidates = BTreeMap ::new ( ) ;
for docid in allowed_candidates {
let words_positions = ctx . docid_words_positions ( docid ) ? ;
let rank = compute_candidate_rank ( branches , words_positions ) ;
candidates . entry ( rank ) . or_insert_with ( RoaringBitmap ::new ) . insert ( docid ) ;
}
Ok ( candidates )
}
2021-03-11 11:48:55 +01:00
// TODO can we keep refs of Query
2021-05-05 20:46:56 +02:00
fn flatten_query_tree ( query_tree : & Operation ) -> FlattenedQueryTree {
2021-06-09 17:28:12 +02:00
use crate ::search ::criteria ::Operation ::{ And , Or , Phrase } ;
2021-03-11 11:48:55 +01:00
2021-05-05 20:46:56 +02:00
fn and_recurse ( head : & Operation , tail : & [ Operation ] ) -> FlattenedQueryTree {
2021-03-11 11:48:55 +01:00
match tail . split_first ( ) {
Some ( ( thead , tail ) ) = > {
let tail = and_recurse ( thead , tail ) ;
let mut out = Vec ::new ( ) ;
for array in recurse ( head ) {
for tail_array in & tail {
let mut array = array . clone ( ) ;
array . extend ( tail_array . iter ( ) . cloned ( ) ) ;
out . push ( array ) ;
}
}
out
2021-06-16 18:33:33 +02:00
}
2021-03-11 11:48:55 +01:00
None = > recurse ( head ) ,
}
}
2021-05-05 20:46:56 +02:00
fn recurse ( op : & Operation ) -> FlattenedQueryTree {
2021-03-11 11:48:55 +01:00
match op {
2021-06-16 18:33:33 +02:00
And ( ops ) = > ops . split_first ( ) . map_or_else ( Vec ::new , | ( h , t ) | and_recurse ( h , t ) ) ,
Or ( _ , ops ) = > {
if ops . iter ( ) . all ( | op | op . query ( ) . is_some ( ) ) {
vec! [ vec! [ ops . iter ( ) . flat_map ( | op | op . query ( ) ) . cloned ( ) . collect ( ) ] ]
} else {
ops . iter ( ) . map ( recurse ) . flatten ( ) . collect ( )
}
}
2021-06-09 17:28:12 +02:00
Phrase ( words ) = > {
2021-06-16 18:33:33 +02:00
let queries = words
. iter ( )
. map ( | word | vec! [ Query { prefix : false , kind : QueryKind ::exact ( word . clone ( ) ) } ] )
. collect ( ) ;
2021-06-09 17:28:12 +02:00
vec! [ queries ]
}
2021-03-29 16:25:14 +02:00
Operation ::Query ( query ) = > vec! [ vec! [ vec! [ query . clone ( ) ] ] ] ,
2021-03-11 11:48:55 +01:00
}
}
recurse ( query_tree )
}
#[cfg(test)]
mod tests {
    use big_s::S;

    use super::*;
    use crate::search::criteria::QueryKind;

    /// Builds a non-prefix query matching `word` exactly.
    fn exact(word: &str) -> Query {
        Query { prefix: false, kind: QueryKind::exact(S(word)) }
    }

    /// Every alternative spelling of the query must end up in its own branch.
    #[test]
    fn simple_flatten_query_tree() {
        let the_fish = Operation::And(vec![
            Operation::Query(exact(" the ")),
            Operation::Query(exact(" fish ")),
        ]);
        let many_the_fish = Operation::And(vec![
            Operation::Query(exact(" many ")),
            Operation::Or(false, vec![Operation::Query(exact(" thefish ")), the_fish]),
        ]);
        let manythe_fish = Operation::And(vec![
            Operation::Query(exact(" manythe ")),
            Operation::Query(exact(" fish ")),
        ]);
        let query_tree = Operation::Or(
            false,
            vec![Operation::Query(exact(" manythefish ")), manythe_fish, many_the_fish],
        );

        let expected = vec![
            vec![vec![exact(" manythefish ")]],
            vec![vec![exact(" manythe ")], vec![exact(" fish ")]],
            vec![vec![exact(" many ")], vec![exact(" thefish ")]],
            vec![vec![exact(" many ")], vec![exact(" the ")], vec![exact(" fish ")]],
        ];

        let result = flatten_query_tree(&query_tree);
        assert_eq!(expected, result);
    }
}