2022-04-04 17:52:35 +08:00
use std ::borrow ::Cow ;
2022-09-01 18:10:47 +08:00
use std ::cmp ::max ;
2022-10-31 20:33:49 +08:00
use std ::collections ::hash_map ::Entry ;
use std ::collections ::HashMap ;
use std ::hash ::Hash ;
use std ::rc ::Rc ;
2022-10-12 15:48:23 +08:00
use std ::{ fmt , mem } ;
2021-03-03 19:03:31 +08:00
2022-06-02 21:47:28 +08:00
use charabia ::classifier ::ClassifiedTokenIter ;
use charabia ::{ SeparatorKind , TokenKind } ;
2021-03-03 19:03:31 +08:00
use roaring ::RoaringBitmap ;
use slice_group_by ::GroupBy ;
2022-04-05 00:56:59 +08:00
use crate ::search ::matches ::matching_words ::{ MatchingWord , PrimitiveWordId } ;
2022-08-18 23:36:08 +08:00
use crate ::search ::TermsMatchingStrategy ;
2022-10-12 15:48:23 +08:00
use crate ::{ CboRoaringBitmapLenCodec , Index , MatchingWords , Result } ;
2021-03-03 19:03:31 +08:00
/// Flag stored as the first field of `Operation::Or`; a `true` value is rendered
/// as `OR(WORD)` by the `Debug` implementation of `Operation`.
type IsOptionalWord = bool ;
/// Flag stored in `Query::prefix` and in `PrimitiveQueryPart::Word`; it is set to
/// `true` for the last unquoted word of a query by `create_primitive_query`.
type IsPrefix = bool ;
#[ derive(Clone, PartialEq, Eq, Hash) ]
pub enum Operation {
And ( Vec < Operation > ) ,
2022-10-26 21:38:06 +08:00
// series of consecutive non prefix and exact words
// `None` means a stop word.
Phrase ( Vec < Option < String > > ) ,
2021-03-03 19:03:31 +08:00
Or ( IsOptionalWord , Vec < Operation > ) ,
Query ( Query ) ,
}
impl fmt ::Debug for Operation {
fn fmt ( & self , f : & mut fmt ::Formatter < '_ > ) -> fmt ::Result {
fn pprint_tree ( f : & mut fmt ::Formatter < '_ > , op : & Operation , depth : usize ) -> fmt ::Result {
match op {
Operation ::And ( children ) = > {
writeln! ( f , " {:1$}AND " , " " , depth * 2 ) ? ;
children . iter ( ) . try_for_each ( | c | pprint_tree ( f , c , depth + 1 ) )
2021-06-17 00:33:33 +08:00
}
2021-06-09 23:28:12 +08:00
Operation ::Phrase ( children ) = > {
writeln! ( f , " {:2$}PHRASE {:?} " , " " , children , depth * 2 )
2021-06-17 00:33:33 +08:00
}
2021-03-03 19:03:31 +08:00
Operation ::Or ( true , children ) = > {
writeln! ( f , " {:1$}OR(WORD) " , " " , depth * 2 ) ? ;
children . iter ( ) . try_for_each ( | c | pprint_tree ( f , c , depth + 1 ) )
2021-06-17 00:33:33 +08:00
}
2021-03-03 19:03:31 +08:00
Operation ::Or ( false , children ) = > {
writeln! ( f , " {:1$}OR " , " " , depth * 2 ) ? ;
children . iter ( ) . try_for_each ( | c | pprint_tree ( f , c , depth + 1 ) )
2021-06-17 00:33:33 +08:00
}
2021-03-03 19:03:31 +08:00
Operation ::Query ( query ) = > writeln! ( f , " {:2$}{:?} " , " " , query , depth * 2 ) ,
}
}
pprint_tree ( f , self , 0 )
}
}
impl Operation {
fn and ( mut ops : Vec < Self > ) -> Self {
if ops . len ( ) = = 1 {
ops . pop ( ) . unwrap ( )
} else {
Self ::And ( ops )
}
}
pub fn or ( word_branch : IsOptionalWord , mut ops : Vec < Self > ) -> Self {
if ops . len ( ) = = 1 {
ops . pop ( ) . unwrap ( )
} else {
2022-08-18 23:36:08 +08:00
let ops = ops
. into_iter ( )
. flat_map ( | o | match o {
Operation ::Or ( wb , children ) if wb = = word_branch = > children ,
op = > vec! [ op ] ,
} )
. collect ( ) ;
2021-03-03 19:03:31 +08:00
Self ::Or ( word_branch , ops )
}
}
2022-10-26 21:38:06 +08:00
fn phrase ( mut words : Vec < Option < String > > ) -> Self {
2021-06-09 23:28:12 +08:00
if words . len ( ) = = 1 {
2022-10-26 21:38:06 +08:00
if let Some ( word ) = words . pop ( ) . unwrap ( ) {
Self ::Query ( Query { prefix : false , kind : QueryKind ::exact ( word ) } )
} else {
Self ::Phrase ( words )
}
2021-03-03 19:03:31 +08:00
} else {
2021-06-09 23:28:12 +08:00
Self ::Phrase ( words )
2021-03-03 19:03:31 +08:00
}
}
2021-02-23 00:17:01 +08:00
pub fn query ( & self ) -> Option < & Query > {
match self {
Operation ::Query ( query ) = > Some ( query ) ,
_ = > None ,
}
}
2021-03-03 19:03:31 +08:00
}
/// A single word of the query tree together with the way it must be matched.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct Query {
    /// Whether the word is flagged as a prefix.
    pub prefix: IsPrefix,
    /// The word itself and its typo settings.
    pub kind: QueryKind,
}
/// The word carried by a `Query` and the number of typos attached to it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum QueryKind {
    /// `word` together with the `typo` count given to `QueryKind::tolerant`.
    Tolerant { typo: u8, word: String },
    /// `word` flagged as exact; `original_typo` is what `typo()` reports for it.
    Exact { original_typo: u8, word: String },
}

impl QueryKind {
    /// Creates an `Exact` kind whose `original_typo` is `0`.
    pub fn exact(word: String) -> Self {
        Self::Exact { original_typo: 0, word }
    }

    /// Creates a `Tolerant` kind with the given `typo` count.
    pub fn tolerant(typo: u8, word: String) -> Self {
        Self::Tolerant { typo, word }
    }

    /// Returns `typo` for `Tolerant` and `original_typo` for `Exact`.
    pub fn typo(&self) -> u8 {
        match *self {
            Self::Tolerant { typo, .. } => typo,
            Self::Exact { original_typo, .. } => original_typo,
        }
    }

    /// Returns the word, whatever the variant.
    pub fn word(&self) -> &str {
        match self {
            Self::Tolerant { word, .. } | Self::Exact { word, .. } => word.as_str(),
        }
    }
}
impl fmt ::Debug for Query {
fn fmt ( & self , f : & mut fmt ::Formatter < '_ > ) -> fmt ::Result {
let Query { prefix , kind } = self ;
let prefix = if * prefix { String ::from ( " Prefix " ) } else { String ::default ( ) } ;
match kind {
QueryKind ::Exact { word , .. } = > {
f . debug_struct ( & ( prefix + " Exact " ) ) . field ( " word " , & word ) . finish ( )
2021-06-17 00:33:33 +08:00
}
QueryKind ::Tolerant { typo , word } = > f
. debug_struct ( & ( prefix + " Tolerant " ) )
. field ( " word " , & word )
. field ( " max typo " , & typo )
. finish ( ) ,
2021-03-03 19:03:31 +08:00
}
}
}
trait Context {
fn word_docids ( & self , word : & str ) -> heed ::Result < Option < RoaringBitmap > > ;
2021-04-10 03:56:20 +08:00
fn synonyms < S : AsRef < str > > ( & self , words : & [ S ] ) -> heed ::Result < Option < Vec < Vec < String > > > > ;
2022-10-13 15:44:27 +08:00
fn word_documents_count ( & self , word : & str ) -> heed ::Result < Option < u64 > > {
match self . word_docids ( word ) ? {
Some ( rb ) = > Ok ( Some ( rb . len ( ) ) ) ,
None = > Ok ( None ) ,
}
}
2022-03-21 20:29:59 +08:00
/// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo ( & self ) -> heed ::Result < ( u8 , u8 ) > ;
2022-05-24 18:14:55 +08:00
fn exact_words ( & self ) -> Option < & fst ::Set < Cow < [ u8 ] > > > ;
2022-10-12 17:57:56 +08:00
fn word_pair_frequency (
& self ,
left_word : & str ,
right_word : & str ,
2022-10-13 15:51:59 +08:00
proximity : u8 ,
2022-10-13 15:27:50 +08:00
) -> heed ::Result < Option < u64 > > ;
2021-03-03 19:03:31 +08:00
}
/// The query tree builder is the interface to build a query tree.
pub struct QueryTreeBuilder < ' a > {
rtxn : & ' a heed ::RoTxn < ' a > ,
index : & ' a Index ,
2022-08-22 23:37:36 +08:00
terms_matching_strategy : TermsMatchingStrategy ,
2021-02-25 17:49:25 +08:00
authorize_typos : bool ,
2021-04-14 01:10:58 +08:00
words_limit : Option < usize > ,
2022-05-24 15:43:17 +08:00
exact_words : Option < fst ::Set < Cow < ' a , [ u8 ] > > > ,
2021-03-03 19:03:31 +08:00
}
impl < ' a > Context for QueryTreeBuilder < ' a > {
fn word_docids ( & self , word : & str ) -> heed ::Result < Option < RoaringBitmap > > {
self . index . word_docids . get ( self . rtxn , word )
}
2021-04-10 03:56:20 +08:00
fn synonyms < S : AsRef < str > > ( & self , words : & [ S ] ) -> heed ::Result < Option < Vec < Vec < String > > > > {
2021-04-07 16:53:57 +08:00
self . index . words_synonyms ( self . rtxn , words )
2021-02-18 22:06:58 +08:00
}
2022-10-13 15:51:59 +08:00
fn word_documents_count ( & self , word : & str ) -> heed ::Result < Option < u64 > > {
self . index . word_documents_count ( self . rtxn , word )
}
2022-03-21 20:29:59 +08:00
fn min_word_len_for_typo ( & self ) -> heed ::Result < ( u8 , u8 ) > {
2022-10-25 03:34:13 +08:00
let one = self . index . min_word_len_one_typo ( self . rtxn ) ? ;
let two = self . index . min_word_len_two_typos ( self . rtxn ) ? ;
2022-03-21 20:29:59 +08:00
Ok ( ( one , two ) )
}
2022-03-21 23:25:15 +08:00
2022-05-24 18:14:55 +08:00
fn exact_words ( & self ) -> Option < & fst ::Set < Cow < [ u8 ] > > > {
self . exact_words . as_ref ( )
2022-03-21 23:25:15 +08:00
}
2022-10-12 15:48:23 +08:00
2022-10-12 17:57:56 +08:00
fn word_pair_frequency (
& self ,
left_word : & str ,
right_word : & str ,
proximity : u8 ,
) -> heed ::Result < Option < u64 > > {
2022-10-18 16:40:26 +08:00
let key = ( proximity , left_word , right_word ) ;
2022-10-12 17:57:56 +08:00
self . index
. word_pair_proximity_docids
. remap_data_type ::< CboRoaringBitmapLenCodec > ( )
2022-10-25 03:34:13 +08:00
. get ( self . rtxn , & key )
2022-10-12 15:48:23 +08:00
}
2021-03-03 19:03:31 +08:00
}
impl < ' a > QueryTreeBuilder < ' a > {
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
/// and an Index `index`.
2022-05-24 15:43:17 +08:00
pub fn new ( rtxn : & ' a heed ::RoTxn < ' a > , index : & ' a Index ) -> Result < Self > {
Ok ( Self {
rtxn ,
index ,
2022-08-22 23:37:36 +08:00
terms_matching_strategy : TermsMatchingStrategy ::default ( ) ,
2022-05-24 15:43:17 +08:00
authorize_typos : true ,
words_limit : None ,
2022-05-24 20:15:33 +08:00
exact_words : index . exact_words ( rtxn ) ? ,
2022-05-24 15:43:17 +08:00
} )
2021-02-25 17:49:25 +08:00
}
2022-08-22 23:37:36 +08:00
/// if `terms_matching_strategy` is set to `All` the query tree will be
2021-02-25 17:49:25 +08:00
/// generated forcing all query words to be present in each matching documents
/// (the criterion `words` will be ignored).
2022-08-22 23:37:36 +08:00
/// default value if not called: `Last`
pub fn terms_matching_strategy (
& mut self ,
terms_matching_strategy : TermsMatchingStrategy ,
) -> & mut Self {
self . terms_matching_strategy = terms_matching_strategy ;
2021-02-25 17:49:25 +08:00
self
}
/// if `authorize_typos` is set to `false` the query tree will be generated
/// forcing all query words to match documents without any typo
/// (the criterion `typo` will be ignored).
/// default value if not called: `true`
pub fn authorize_typos ( & mut self , authorize_typos : bool ) -> & mut Self {
self . authorize_typos = authorize_typos ;
self
2021-03-03 19:03:31 +08:00
}
2021-04-14 01:10:58 +08:00
/// Limit words and phrases that will be taken for query building.
/// Any beyond `words_limit` will be ignored.
pub fn words_limit ( & mut self , words_limit : usize ) -> & mut Self {
self . words_limit = Some ( words_limit ) ;
self
}
2021-03-03 19:03:31 +08:00
/// Build the query tree:
2022-08-22 23:37:36 +08:00
/// - if `terms_matching_strategy` is set to `All` the query tree will be
2021-03-03 19:03:31 +08:00
/// generated forcing all query words to be present in each matching documents
/// (the criterion `words` will be ignored)
/// - if `authorize_typos` is set to `false` the query tree will be generated
/// forcing all query words to match documents without any typo
/// (the criterion `typo` will be ignored)
2022-06-02 21:47:28 +08:00
pub fn build < A : AsRef < [ u8 ] > > (
2022-04-05 00:56:59 +08:00
& self ,
2022-06-02 21:47:28 +08:00
query : ClassifiedTokenIter < A > ,
2022-04-05 00:56:59 +08:00
) -> Result < Option < ( Operation , PrimitiveQuery , MatchingWords ) > > {
2022-10-20 21:05:39 +08:00
let primitive_query = create_primitive_query ( query , self . words_limit ) ;
2021-03-03 19:03:31 +08:00
if ! primitive_query . is_empty ( ) {
2021-06-17 00:33:33 +08:00
let qt = create_query_tree (
self ,
2022-08-22 23:37:36 +08:00
self . terms_matching_strategy ,
2021-06-17 00:33:33 +08:00
self . authorize_typos ,
& primitive_query ,
) ? ;
2022-04-05 00:56:59 +08:00
let matching_words =
create_matching_words ( self , self . authorize_typos , & primitive_query ) ? ;
Ok ( Some ( ( qt , primitive_query , matching_words ) ) )
2021-03-03 19:03:31 +08:00
} else {
Ok ( None )
}
}
}
2022-10-12 16:06:48 +08:00
/// Split the word depending on the frequency of pairs near together in the database documents.
2022-04-07 23:05:44 +08:00
fn split_best_frequency < ' a > (
ctx : & impl Context ,
word : & ' a str ,
2022-10-13 15:44:27 +08:00
) -> heed ::Result < Option < ( & ' a str , & ' a str ) > > {
2021-03-03 19:03:31 +08:00
let chars = word . char_indices ( ) . skip ( 1 ) ;
let mut best = None ;
for ( i , _ ) in chars {
let ( left , right ) = word . split_at ( i ) ;
2022-10-12 15:48:23 +08:00
let pair_freq = ctx . word_pair_frequency ( left , right , 1 ) ? . unwrap_or ( 0 ) ;
2021-03-03 19:03:31 +08:00
2022-10-12 15:48:23 +08:00
if pair_freq ! = 0 & & best . map_or ( true , | ( old , _ , _ ) | pair_freq > old ) {
best = Some ( ( pair_freq , left , right ) ) ;
2021-03-03 19:03:31 +08:00
}
}
2022-10-13 15:44:27 +08:00
Ok ( best . map ( | ( _ , left , right ) | ( left , right ) ) )
2021-03-03 19:03:31 +08:00
}
2022-03-31 19:50:18 +08:00
#[ derive(Clone) ]
2022-03-21 23:25:15 +08:00
pub struct TypoConfig < ' a > {
2022-03-21 20:29:59 +08:00
pub max_typos : u8 ,
2022-04-01 00:23:12 +08:00
pub word_len_one_typo : u8 ,
pub word_len_two_typo : u8 ,
2022-05-24 18:14:55 +08:00
pub exact_words : Option < & ' a fst ::Set < Cow < ' a , [ u8 ] > > > ,
2022-03-21 20:29:59 +08:00
}
2021-03-03 19:03:31 +08:00
/// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length.
2022-10-25 18:42:38 +08:00
fn typos ( word : String , authorize_typos : bool , config : TypoConfig ) -> QueryKind {
2022-05-24 20:15:33 +08:00
if authorize_typos & & ! config . exact_words . map_or ( false , | s | s . contains ( & word ) ) {
2022-03-21 20:29:59 +08:00
let count = word . chars ( ) . count ( ) . min ( u8 ::MAX as usize ) as u8 ;
2022-04-01 00:37:43 +08:00
if count < config . word_len_one_typo {
2022-03-21 20:29:59 +08:00
QueryKind ::exact ( word )
2022-04-01 00:37:43 +08:00
} else if count < config . word_len_two_typo {
2022-03-21 20:29:59 +08:00
QueryKind ::tolerant ( 1. min ( config . max_typos ) , word )
} else {
QueryKind ::tolerant ( 2. min ( config . max_typos ) , word )
2021-03-03 19:03:31 +08:00
}
} else {
QueryKind ::exact ( word )
}
}
2021-04-10 03:56:20 +08:00
/// Fetch synonyms from the `Context` for the provided word
2021-03-03 19:03:31 +08:00
/// and create the list of operations for the query tree
2021-04-10 03:56:20 +08:00
fn synonyms ( ctx : & impl Context , word : & [ & str ] ) -> heed ::Result < Option < Vec < Operation > > > {
let synonyms = ctx . synonyms ( word ) ? ;
2021-03-03 19:03:31 +08:00
Ok ( synonyms . map ( | synonyms | {
2021-06-17 00:33:33 +08:00
synonyms
. into_iter ( )
. map ( | synonym | {
let words = synonym
. into_iter ( )
. map ( | word | {
Operation ::Query ( Query { prefix : false , kind : QueryKind ::exact ( word ) } )
} )
. collect ( ) ;
Operation ::and ( words )
} )
. collect ( )
2021-03-03 19:03:31 +08:00
} ) )
}
/// Main function that creates the final query tree from the primitive query.
fn create_query_tree (
ctx : & impl Context ,
2022-08-22 23:37:36 +08:00
terms_matching_strategy : TermsMatchingStrategy ,
2021-03-03 19:03:31 +08:00
authorize_typos : bool ,
2021-05-04 19:44:55 +08:00
query : & [ PrimitiveQueryPart ] ,
2021-06-17 00:33:33 +08:00
) -> Result < Operation > {
2021-03-03 19:03:31 +08:00
/// Matches on the `PrimitiveQueryPart` and create an operation from it.
fn resolve_primitive_part (
ctx : & impl Context ,
authorize_typos : bool ,
part : PrimitiveQueryPart ,
2021-06-17 00:33:33 +08:00
) -> Result < Operation > {
2021-03-03 19:03:31 +08:00
match part {
// 1. try to split word in 2
// 2. try to fetch synonyms
// 3. create an operation containing the word
// 4. wrap all in an OR operation
PrimitiveQueryPart ::Word ( word , prefix ) = > {
let mut children = synonyms ( ctx , & [ & word ] ) ? . unwrap_or_default ( ) ;
2022-10-13 15:44:27 +08:00
if let Some ( ( left , right ) ) = split_best_frequency ( ctx , & word ) ? {
2022-10-26 21:38:06 +08:00
children . push ( Operation ::Phrase ( vec! [
Some ( left . to_string ( ) ) ,
Some ( right . to_string ( ) ) ,
] ) ) ;
2021-03-03 19:03:31 +08:00
}
2022-04-01 00:23:12 +08:00
let ( word_len_one_typo , word_len_two_typo ) = ctx . min_word_len_for_typo ( ) ? ;
2022-05-24 15:43:17 +08:00
let exact_words = ctx . exact_words ( ) ;
2022-03-21 23:25:15 +08:00
let config =
TypoConfig { max_typos : 2 , word_len_one_typo , word_len_two_typo , exact_words } ;
2022-01-21 01:34:54 +08:00
children . push ( Operation ::Query ( Query {
prefix ,
2022-03-21 20:29:59 +08:00
kind : typos ( word , authorize_typos , config ) ,
2022-01-21 01:34:54 +08:00
} ) ) ;
2021-03-03 19:03:31 +08:00
Ok ( Operation ::or ( false , children ) )
2021-06-17 00:33:33 +08:00
}
2021-03-03 19:03:31 +08:00
// create a CONSECUTIVE operation wrapping all word in the phrase
2021-06-17 00:33:33 +08:00
PrimitiveQueryPart ::Phrase ( words ) = > Ok ( Operation ::phrase ( words ) ) ,
2021-03-03 19:03:31 +08:00
}
}
/// Create all ngrams 1..=3 generating query tree branches.
fn ngrams (
ctx : & impl Context ,
authorize_typos : bool ,
query : & [ PrimitiveQueryPart ] ,
2022-08-18 23:36:08 +08:00
any_words : bool ,
2021-06-17 00:33:33 +08:00
) -> Result < Operation > {
2021-03-03 19:03:31 +08:00
const MAX_NGRAM : usize = 3 ;
let mut op_children = Vec ::new ( ) ;
2021-04-09 03:21:20 +08:00
for sub_query in query . linear_group_by ( | a , b | ! ( a . is_phrase ( ) | | b . is_phrase ( ) ) ) {
2021-03-03 19:03:31 +08:00
let mut or_op_children = Vec ::new ( ) ;
for ngram in 1 ..= MAX_NGRAM . min ( sub_query . len ( ) ) {
if let Some ( group ) = sub_query . get ( .. ngram ) {
let mut and_op_children = Vec ::new ( ) ;
let tail = & sub_query [ ngram .. ] ;
let is_last = tail . is_empty ( ) ;
match group {
[ part ] = > {
2021-06-17 00:33:33 +08:00
let operation =
resolve_primitive_part ( ctx , authorize_typos , part . clone ( ) ) ? ;
2021-03-03 19:03:31 +08:00
and_op_children . push ( operation ) ;
2021-06-17 00:33:33 +08:00
}
2021-03-03 19:03:31 +08:00
words = > {
2021-04-09 03:21:20 +08:00
let is_prefix = words . last ( ) . map_or ( false , | part | part . is_prefix ( ) ) ;
2021-06-17 00:33:33 +08:00
let words : Vec < _ > = words
. iter ( )
. filter_map ( | part | {
if let PrimitiveQueryPart ::Word ( word , _ ) = part {
Some ( word . as_str ( ) )
} else {
None
}
} )
. collect ( ) ;
2021-03-03 19:03:31 +08:00
let mut operations = synonyms ( ctx , & words ) ? . unwrap_or_default ( ) ;
let concat = words . concat ( ) ;
2022-04-01 00:23:12 +08:00
let ( word_len_one_typo , word_len_two_typo ) =
ctx . min_word_len_for_typo ( ) ? ;
2022-05-24 15:43:17 +08:00
let exact_words = ctx . exact_words ( ) ;
2022-03-21 23:25:15 +08:00
let config = TypoConfig {
max_typos : 1 ,
word_len_one_typo ,
word_len_two_typo ,
exact_words ,
} ;
2022-02-03 01:45:11 +08:00
let query = Query {
prefix : is_prefix ,
2022-03-21 20:29:59 +08:00
kind : typos ( concat , authorize_typos , config ) ,
2022-02-03 01:45:11 +08:00
} ;
2021-04-08 21:12:37 +08:00
operations . push ( Operation ::Query ( query ) ) ;
and_op_children . push ( Operation ::or ( false , operations ) ) ;
2021-03-03 19:03:31 +08:00
}
}
if ! is_last {
2022-08-18 23:36:08 +08:00
let ngrams = ngrams ( ctx , authorize_typos , tail , any_words ) ? ;
2021-03-03 19:03:31 +08:00
and_op_children . push ( ngrams ) ;
}
2022-08-18 23:36:08 +08:00
if any_words {
or_op_children . push ( Operation ::or ( false , and_op_children ) ) ;
} else {
or_op_children . push ( Operation ::and ( and_op_children ) ) ;
}
2021-03-03 19:03:31 +08:00
}
}
op_children . push ( Operation ::or ( false , or_op_children ) ) ;
}
2022-08-18 23:36:08 +08:00
if any_words {
Ok ( Operation ::or ( false , op_children ) )
} else {
Ok ( Operation ::and ( op_children ) )
}
2021-03-03 19:03:31 +08:00
}
2022-08-18 23:36:08 +08:00
let number_phrases = query . iter ( ) . filter ( | p | p . is_phrase ( ) ) . count ( ) ;
2022-09-01 18:10:47 +08:00
let remove_count = query . len ( ) - max ( number_phrases , 1 ) ;
2022-08-18 23:36:08 +08:00
if remove_count = = 0 {
return ngrams ( ctx , authorize_typos , query , false ) ;
}
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let mut operation_children = Vec ::new ( ) ;
let mut query = query . to_vec ( ) ;
2022-09-01 18:10:47 +08:00
for _ in 0 ..= remove_count {
2022-08-22 23:37:36 +08:00
let pos = match terms_matching_strategy {
2022-08-18 23:36:08 +08:00
TermsMatchingStrategy ::All = > return ngrams ( ctx , authorize_typos , & query , false ) ,
TermsMatchingStrategy ::Any = > {
let operation = Operation ::Or (
true ,
vec! [
// branch allowing matching documents to contains any query word.
ngrams ( ctx , authorize_typos , & query , true ) ? ,
// branch forcing matching documents to contains all the query words,
// keeping this documents of the top of the resulted list.
ngrams ( ctx , authorize_typos , & query , false ) ? ,
] ,
) ;
return Ok ( operation ) ;
}
TermsMatchingStrategy ::Last = > query
2021-06-17 00:33:33 +08:00
. iter ( )
2022-08-18 23:36:08 +08:00
. enumerate ( )
. filter ( | ( _ , part ) | ! part . is_phrase ( ) )
. last ( )
. map ( | ( pos , _ ) | pos ) ,
TermsMatchingStrategy ::First = > {
query . iter ( ) . enumerate ( ) . find ( | ( _ , part ) | ! part . is_phrase ( ) ) . map ( | ( pos , _ ) | pos )
}
TermsMatchingStrategy ::Size = > query
. iter ( )
. enumerate ( )
. filter ( | ( _ , part ) | ! part . is_phrase ( ) )
. min_by_key ( | ( _ , part ) | match part {
PrimitiveQueryPart ::Word ( s , _ ) = > s . len ( ) ,
_ = > unreachable! ( ) ,
} )
. map ( | ( pos , _ ) | pos ) ,
TermsMatchingStrategy ::Frequency = > query
. iter ( )
. enumerate ( )
. filter ( | ( _ , part ) | ! part . is_phrase ( ) )
. max_by_key ( | ( _ , part ) | match part {
PrimitiveQueryPart ::Word ( s , _ ) = > {
2022-10-13 15:51:59 +08:00
ctx . word_documents_count ( s ) . unwrap_or_default ( ) . unwrap_or ( u64 ::max_value ( ) )
2021-06-17 00:33:33 +08:00
}
2022-08-18 23:36:08 +08:00
_ = > unreachable! ( ) ,
2021-06-17 00:33:33 +08:00
} )
2022-08-18 23:36:08 +08:00
. map ( | ( pos , _ ) | pos ) ,
} ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
// compute and push the current branch on the front
operation_children . insert ( 0 , ngrams ( ctx , authorize_typos , & query , false ) ? ) ;
// remove word from query before creating an new branch
match pos {
Some ( pos ) = > query . remove ( pos ) ,
None = > break ,
} ;
2021-03-03 19:03:31 +08:00
}
2022-08-18 23:56:06 +08:00
Ok ( Operation ::or ( true , operation_children ) )
2021-03-03 19:03:31 +08:00
}
2022-10-31 20:33:49 +08:00
#[ derive(Default, Debug) ]
struct MatchingWordCache {
all : Vec < Rc < MatchingWord > > ,
map : HashMap < ( String , u8 , bool ) , Rc < MatchingWord > > ,
}
impl MatchingWordCache {
2022-11-24 16:00:53 +08:00
fn insert ( & mut self , word : String , typo : u8 , prefix : bool ) -> Option < Rc < MatchingWord > > {
2022-10-31 20:33:49 +08:00
match self . map . entry ( ( word . clone ( ) , typo , prefix ) ) {
2022-11-24 16:00:53 +08:00
Entry ::Occupied ( idx ) = > Some ( idx . get ( ) . clone ( ) ) ,
2022-10-31 20:33:49 +08:00
Entry ::Vacant ( vacant ) = > {
2022-11-24 16:00:53 +08:00
let matching_word = Rc ::new ( MatchingWord ::new ( word , typo , prefix ) ? ) ;
2022-10-31 20:33:49 +08:00
self . all . push ( matching_word . clone ( ) ) ;
vacant . insert ( matching_word . clone ( ) ) ;
2022-11-24 16:00:53 +08:00
Some ( matching_word )
2022-10-31 20:33:49 +08:00
}
}
2022-11-24 16:00:53 +08:00
// To deactivate the cache, for testing purposes, use the following instead:
// let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?);
// self.all.push(matching_word.clone());
// Some(matching_word)
2022-10-31 20:33:49 +08:00
}
}
2022-04-05 00:56:59 +08:00
/// Main function that matchings words used for crop and highlight.
fn create_matching_words (
ctx : & impl Context ,
authorize_typos : bool ,
query : & [ PrimitiveQueryPart ] ,
) -> Result < MatchingWords > {
/// Matches on the `PrimitiveQueryPart` and create matchings words from it.
fn resolve_primitive_part (
ctx : & impl Context ,
authorize_typos : bool ,
part : PrimitiveQueryPart ,
2022-10-31 20:33:49 +08:00
matching_words : & mut Vec < ( Vec < Rc < MatchingWord > > , Vec < PrimitiveWordId > ) > ,
matching_word_cache : & mut MatchingWordCache ,
2022-04-05 00:56:59 +08:00
id : PrimitiveWordId ,
) -> Result < ( ) > {
match part {
// 1. try to split word in 2
// 2. try to fetch synonyms
PrimitiveQueryPart ::Word ( word , prefix ) = > {
if let Some ( synonyms ) = ctx . synonyms ( & [ word . as_str ( ) ] ) ? {
for synonym in synonyms {
let synonym = synonym
. into_iter ( )
2022-11-24 16:00:53 +08:00
. flat_map ( | syn | matching_word_cache . insert ( syn , 0 , false ) )
2022-04-05 00:56:59 +08:00
. collect ( ) ;
matching_words . push ( ( synonym , vec! [ id ] ) ) ;
}
}
2022-10-13 15:44:27 +08:00
if let Some ( ( left , right ) ) = split_best_frequency ( ctx , & word ) ? {
2022-11-24 16:00:53 +08:00
if let Some ( left ) = matching_word_cache . insert ( left . to_string ( ) , 0 , false ) {
if let Some ( right ) = matching_word_cache . insert ( right . to_string ( ) , 0 , false )
{
matching_words . push ( ( vec! [ left , right ] , vec! [ id ] ) ) ;
}
}
2022-04-05 00:56:59 +08:00
}
let ( word_len_one_typo , word_len_two_typo ) = ctx . min_word_len_for_typo ( ) ? ;
2022-05-24 15:43:17 +08:00
let exact_words = ctx . exact_words ( ) ;
2022-04-05 00:56:59 +08:00
let config =
TypoConfig { max_typos : 2 , word_len_one_typo , word_len_two_typo , exact_words } ;
let matching_word = match typos ( word , authorize_typos , config ) {
2022-10-31 20:33:49 +08:00
QueryKind ::Exact { word , .. } = > matching_word_cache . insert ( word , 0 , prefix ) ,
QueryKind ::Tolerant { typo , word } = > {
matching_word_cache . insert ( word , typo , prefix )
}
2022-04-05 00:56:59 +08:00
} ;
2022-11-24 16:00:53 +08:00
if let Some ( matching_word ) = matching_word {
matching_words . push ( ( vec! [ matching_word ] , vec! [ id ] ) ) ;
}
2022-04-05 00:56:59 +08:00
}
// create a CONSECUTIVE matchings words wrapping all word in the phrase
PrimitiveQueryPart ::Phrase ( words ) = > {
let ids : Vec < _ > =
( 0 .. words . len ( ) ) . into_iter ( ) . map ( | i | id + i as PrimitiveWordId ) . collect ( ) ;
2022-10-31 20:33:49 +08:00
let words = words
. into_iter ( )
. flatten ( )
2022-11-24 16:00:53 +08:00
. flat_map ( | w | matching_word_cache . insert ( w , 0 , false ) )
2022-10-31 20:33:49 +08:00
. collect ( ) ;
2022-04-05 00:56:59 +08:00
matching_words . push ( ( words , ids ) ) ;
}
}
Ok ( ( ) )
}
/// Create all ngrams 1..=3 generating query tree branches.
fn ngrams (
ctx : & impl Context ,
authorize_typos : bool ,
query : & [ PrimitiveQueryPart ] ,
2022-10-31 20:33:49 +08:00
matching_words : & mut Vec < ( Vec < Rc < MatchingWord > > , Vec < PrimitiveWordId > ) > ,
matching_word_cache : & mut MatchingWordCache ,
2022-04-05 00:56:59 +08:00
mut id : PrimitiveWordId ,
) -> Result < ( ) > {
const MAX_NGRAM : usize = 3 ;
for sub_query in query . linear_group_by ( | a , b | ! ( a . is_phrase ( ) | | b . is_phrase ( ) ) ) {
for ngram in 1 ..= MAX_NGRAM . min ( sub_query . len ( ) ) {
if let Some ( group ) = sub_query . get ( .. ngram ) {
let tail = & sub_query [ ngram .. ] ;
let is_last = tail . is_empty ( ) ;
match group {
[ part ] = > {
resolve_primitive_part (
ctx ,
authorize_typos ,
part . clone ( ) ,
matching_words ,
2022-10-31 20:33:49 +08:00
matching_word_cache ,
2022-04-05 00:56:59 +08:00
id ,
) ? ;
}
words = > {
let is_prefix = words . last ( ) . map_or ( false , | part | part . is_prefix ( ) ) ;
let words : Vec < _ > = words
. iter ( )
. filter_map ( | part | {
if let PrimitiveQueryPart ::Word ( word , _ ) = part {
Some ( word . as_str ( ) )
} else {
None
}
} )
. collect ( ) ;
let ids : Vec < _ > = ( 0 .. words . len ( ) )
. into_iter ( )
. map ( | i | id + i as PrimitiveWordId )
. collect ( ) ;
if let Some ( synonyms ) = ctx . synonyms ( & words ) ? {
for synonym in synonyms {
let synonym = synonym
. into_iter ( )
2022-11-24 16:00:53 +08:00
. flat_map ( | syn | matching_word_cache . insert ( syn , 0 , false ) )
2022-04-05 00:56:59 +08:00
. collect ( ) ;
matching_words . push ( ( synonym , ids . clone ( ) ) ) ;
}
}
let word = words . concat ( ) ;
let ( word_len_one_typo , word_len_two_typo ) =
ctx . min_word_len_for_typo ( ) ? ;
2022-05-24 15:43:17 +08:00
let exact_words = ctx . exact_words ( ) ;
2022-04-05 00:56:59 +08:00
let config = TypoConfig {
max_typos : 1 ,
word_len_one_typo ,
word_len_two_typo ,
exact_words ,
} ;
let matching_word = match typos ( word , authorize_typos , config ) {
QueryKind ::Exact { word , .. } = > {
2022-10-31 20:33:49 +08:00
matching_word_cache . insert ( word , 0 , is_prefix )
2022-04-05 00:56:59 +08:00
}
QueryKind ::Tolerant { typo , word } = > {
2022-10-31 20:33:49 +08:00
matching_word_cache . insert ( word , typo , is_prefix )
2022-04-05 00:56:59 +08:00
}
} ;
2022-11-24 16:00:53 +08:00
if let Some ( matching_word ) = matching_word {
matching_words . push ( ( vec! [ matching_word ] , ids ) ) ;
}
2022-04-05 00:56:59 +08:00
}
}
if ! is_last {
2022-10-31 20:33:49 +08:00
ngrams (
ctx ,
authorize_typos ,
tail ,
matching_words ,
matching_word_cache ,
id + 1 ,
) ? ;
2022-04-05 00:56:59 +08:00
}
}
}
id + = sub_query . iter ( ) . map ( | x | x . len ( ) as PrimitiveWordId ) . sum ::< PrimitiveWordId > ( ) ;
}
Ok ( ( ) )
}
2022-10-31 20:33:49 +08:00
let mut matching_word_cache = MatchingWordCache ::default ( ) ;
2022-04-05 00:56:59 +08:00
let mut matching_words = Vec ::new ( ) ;
2022-10-31 20:33:49 +08:00
ngrams ( ctx , authorize_typos , query , & mut matching_words , & mut matching_word_cache , 0 ) ? ;
2022-04-05 00:56:59 +08:00
Ok ( MatchingWords ::new ( matching_words ) )
}
2021-05-04 19:44:55 +08:00
pub type PrimitiveQuery = Vec < PrimitiveQueryPart > ;
2021-03-03 19:03:31 +08:00
#[ derive(Debug, Clone) ]
2021-05-04 19:44:55 +08:00
pub enum PrimitiveQueryPart {
2022-10-26 21:38:06 +08:00
Phrase ( Vec < Option < String > > ) ,
2021-03-03 19:03:31 +08:00
Word ( String , IsPrefix ) ,
}
impl PrimitiveQueryPart {
fn is_phrase ( & self ) -> bool {
matches! ( self , Self ::Phrase ( _ ) )
}
fn is_prefix ( & self ) -> bool {
matches! ( self , Self ::Word ( _ , is_prefix ) if * is_prefix )
}
2022-04-05 00:56:59 +08:00
fn len ( & self ) -> usize {
match self {
Self ::Phrase ( words ) = > words . len ( ) ,
Self ::Word ( _ , _ ) = > 1 ,
}
}
2021-03-03 19:03:31 +08:00
}
/// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree.
2022-06-02 21:47:28 +08:00
fn create_primitive_query < A > (
query : ClassifiedTokenIter < A > ,
2021-06-17 00:33:33 +08:00
words_limit : Option < usize > ,
2022-06-02 21:47:28 +08:00
) -> PrimitiveQuery
where
A : AsRef < [ u8 ] > ,
{
2021-03-03 19:03:31 +08:00
let mut primitive_query = Vec ::new ( ) ;
let mut phrase = Vec ::new ( ) ;
let mut quoted = false ;
2021-04-14 01:10:58 +08:00
let parts_limit = words_limit . unwrap_or ( usize ::MAX ) ;
2021-03-03 19:03:31 +08:00
let mut peekable = query . peekable ( ) ;
while let Some ( token ) = peekable . next ( ) {
2021-04-14 01:10:58 +08:00
// early return if word limit is exceeded
2021-06-17 00:33:33 +08:00
if primitive_query . len ( ) > = parts_limit {
return primitive_query ;
}
2021-04-14 01:10:58 +08:00
2021-03-03 19:03:31 +08:00
match token . kind {
2021-06-17 00:33:33 +08:00
TokenKind ::Word | TokenKind ::StopWord = > {
2021-03-03 19:03:31 +08:00
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
2021-04-09 03:21:20 +08:00
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
2021-03-03 19:03:31 +08:00
// 3. if the word is the last token of the query we push it as a prefix word.
if quoted {
2022-10-20 21:05:39 +08:00
if let TokenKind ::StopWord = token . kind {
2022-10-26 21:38:06 +08:00
phrase . push ( None )
} else {
phrase . push ( Some ( token . lemma ( ) . to_string ( ) ) ) ;
}
2021-03-03 19:03:31 +08:00
} else if peekable . peek ( ) . is_some ( ) {
2022-10-20 21:05:39 +08:00
if let TokenKind ::StopWord = token . kind {
} else {
2021-06-17 00:33:33 +08:00
primitive_query
2022-06-02 21:47:28 +08:00
. push ( PrimitiveQueryPart ::Word ( token . lemma ( ) . to_string ( ) , false ) ) ;
2021-06-17 00:33:33 +08:00
}
2021-03-03 19:03:31 +08:00
} else {
2022-06-02 21:47:28 +08:00
primitive_query . push ( PrimitiveQueryPart ::Word ( token . lemma ( ) . to_string ( ) , true ) ) ;
2021-03-03 19:03:31 +08:00
}
2021-06-17 00:33:33 +08:00
}
2021-06-08 23:29:38 +08:00
TokenKind ::Separator ( separator_kind ) = > {
2022-06-02 21:47:28 +08:00
let quote_count = token . lemma ( ) . chars ( ) . filter ( | & s | s = = '"' ) . count ( ) ;
2021-03-03 19:03:31 +08:00
// swap quoted state if we encounter a double quote
if quote_count % 2 ! = 0 {
quoted = ! quoted ;
}
2021-06-08 23:52:37 +08:00
// if there is a quote or a hard separator we close the phrase.
2021-06-17 00:33:33 +08:00
if ! phrase . is_empty ( ) & & ( quote_count > 0 | | separator_kind = = SeparatorKind ::Hard )
{
2021-03-03 19:03:31 +08:00
primitive_query . push ( PrimitiveQueryPart ::Phrase ( mem ::take ( & mut phrase ) ) ) ;
}
2021-06-17 00:33:33 +08:00
}
2021-03-03 19:03:31 +08:00
_ = > ( ) ,
}
}
// If a quote is never closed, we consider all of the end of the query as a phrase.
if ! phrase . is_empty ( ) {
primitive_query . push ( PrimitiveQueryPart ::Phrase ( mem ::take ( & mut phrase ) ) ) ;
}
primitive_query
}
2021-02-23 22:50:33 +08:00
/// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo ( operation : & Operation ) -> usize {
2021-06-17 00:33:33 +08:00
use Operation ::{ And , Or , Phrase , Query } ;
2021-02-23 22:50:33 +08:00
match operation {
Or ( _ , ops ) = > ops . iter ( ) . map ( maximum_typo ) . max ( ) . unwrap_or ( 0 ) ,
2021-06-09 23:28:12 +08:00
And ( ops ) = > ops . iter ( ) . map ( maximum_typo ) . sum ::< usize > ( ) ,
2021-02-23 22:50:33 +08:00
Query ( q ) = > q . kind . typo ( ) as usize ,
2021-06-09 23:28:12 +08:00
// no typo allowed in phrases
Phrase ( _ ) = > 0 ,
2021-02-23 22:50:33 +08:00
}
}
2021-02-23 22:53:24 +08:00
/// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity ( operation : & Operation ) -> usize {
2021-06-17 00:33:33 +08:00
use Operation ::{ And , Or , Phrase , Query } ;
2021-02-23 22:53:24 +08:00
match operation {
Or ( _ , ops ) = > ops . iter ( ) . map ( maximum_proximity ) . max ( ) . unwrap_or ( 0 ) ,
2021-02-24 22:36:57 +08:00
And ( ops ) = > {
2021-06-17 00:33:33 +08:00
ops . iter ( ) . map ( maximum_proximity ) . sum ::< usize > ( ) + ops . len ( ) . saturating_sub ( 1 ) * 7
}
2021-06-09 23:28:12 +08:00
Query ( _ ) | Phrase ( _ ) = > 0 ,
2021-02-23 22:53:24 +08:00
}
}
2021-03-03 19:03:31 +08:00
#[ cfg(test) ]
mod test {
2022-10-31 20:33:49 +08:00
use std ::alloc ::{ GlobalAlloc , System } ;
2021-03-02 18:30:48 +08:00
use std ::collections ::HashMap ;
2022-10-31 20:33:49 +08:00
use std ::sync ::atomic ::{ self , AtomicI64 } ;
2021-03-02 18:30:48 +08:00
2022-06-02 21:47:28 +08:00
use charabia ::Tokenize ;
2021-06-01 17:48:56 +08:00
use maplit ::hashmap ;
2021-06-17 00:33:33 +08:00
use rand ::rngs ::StdRng ;
use rand ::{ Rng , SeedableRng } ;
2021-03-03 19:03:31 +08:00
use super ::* ;
2022-10-31 20:33:49 +08:00
use crate ::index ::tests ::TempIndex ;
2022-04-01 17:21:51 +08:00
use crate ::index ::{ DEFAULT_MIN_WORD_LEN_ONE_TYPO , DEFAULT_MIN_WORD_LEN_TWO_TYPOS } ;
2021-03-02 18:30:48 +08:00
2021-03-03 19:03:31 +08:00
#[ derive(Debug) ]
struct TestContext {
synonyms : HashMap < Vec < String > , Vec < Vec < String > > > ,
postings : HashMap < String , RoaringBitmap > ,
2022-05-24 15:43:17 +08:00
exact_words : Option < fst ::Set < Cow < 'static , [ u8 ] > > > ,
2021-03-03 19:03:31 +08:00
}
impl TestContext {
2022-06-02 21:47:28 +08:00
fn build < A : AsRef < [ u8 ] > > (
2021-03-03 19:03:31 +08:00
& self ,
2022-08-22 23:37:36 +08:00
terms_matching_strategy : TermsMatchingStrategy ,
2021-03-03 19:03:31 +08:00
authorize_typos : bool ,
2021-04-14 01:10:58 +08:00
words_limit : Option < usize > ,
2022-06-02 21:47:28 +08:00
query : ClassifiedTokenIter < A > ,
2021-06-17 00:33:33 +08:00
) -> Result < Option < ( Operation , PrimitiveQuery ) > > {
2022-10-20 21:05:39 +08:00
let primitive_query = create_primitive_query ( query , words_limit ) ;
2021-03-03 19:03:31 +08:00
if ! primitive_query . is_empty ( ) {
2022-08-22 23:37:36 +08:00
let qt = create_query_tree (
self ,
terms_matching_strategy ,
authorize_typos ,
& primitive_query ,
) ? ;
2021-05-04 19:44:55 +08:00
Ok ( Some ( ( qt , primitive_query ) ) )
2021-03-03 19:03:31 +08:00
} else {
Ok ( None )
}
}
}
impl Context for TestContext {
fn word_docids ( & self , word : & str ) -> heed ::Result < Option < RoaringBitmap > > {
Ok ( self . postings . get ( word ) . cloned ( ) )
}
2021-04-10 03:56:20 +08:00
fn synonyms < S : AsRef < str > > ( & self , words : & [ S ] ) -> heed ::Result < Option < Vec < Vec < String > > > > {
let words : Vec < _ > = words . iter ( ) . map ( | s | s . as_ref ( ) . to_owned ( ) ) . collect ( ) ;
2021-03-03 19:03:31 +08:00
Ok ( self . synonyms . get ( & words ) . cloned ( ) )
}
2022-03-21 20:29:59 +08:00
fn min_word_len_for_typo ( & self ) -> heed ::Result < ( u8 , u8 ) > {
2022-04-01 00:42:10 +08:00
Ok ( ( DEFAULT_MIN_WORD_LEN_ONE_TYPO , DEFAULT_MIN_WORD_LEN_TWO_TYPOS ) )
2022-03-21 20:29:59 +08:00
}
2022-03-21 23:25:15 +08:00
2022-05-24 18:14:55 +08:00
fn exact_words ( & self ) -> Option < & fst ::Set < Cow < [ u8 ] > > > {
self . exact_words . as_ref ( )
2022-03-21 23:25:15 +08:00
}
2022-10-13 15:27:50 +08:00
fn word_pair_frequency (
& self ,
left_word : & str ,
right_word : & str ,
_proximity : u8 ,
) -> heed ::Result < Option < u64 > > {
match self . word_docids ( & format! ( " {} {} " , left_word , right_word ) ) ? {
Some ( rb ) = > Ok ( Some ( rb . len ( ) ) ) ,
None = > Ok ( None ) ,
}
}
2021-03-03 19:03:31 +08:00
}
// Default fixture: a fixed synonym table, postings filled with pseudo-random
// document ids drawn from an RNG seeded with 102, and an exact-words set built
// from an empty `fst::SetBuilder`.
impl Default for TestContext {
fn default ( ) -> TestContext {
let mut rng = StdRng ::seed_from_u64 ( 102 ) ;
let rng = & mut rng ;
// Draws `len` random `u32` values, sorts them and builds a bitmap from the
// sorted sequence.
fn random_postings < R : Rng > ( rng : & mut R , len : usize ) -> RoaringBitmap {
let mut values = Vec ::< u32 > ::with_capacity ( len ) ;
while values . len ( ) ! = len {
values . push ( rng . gen ( ) ) ;
}
values . sort_unstable ( ) ;
2022-03-15 00:13:07 +08:00
RoaringBitmap ::from_sorted_iter ( values . into_iter ( ) ) . unwrap ( )
2021-03-03 19:03:31 +08:00
}
2022-03-22 16:55:49 +08:00
let exact_words = fst ::SetBuilder ::new ( Vec ::new ( ) ) . unwrap ( ) . into_inner ( ) . unwrap ( ) ;
2022-05-24 15:43:17 +08:00
let exact_words =
Some ( fst ::Set ::new ( exact_words ) . unwrap ( ) . map_data ( Cow ::Owned ) . unwrap ( ) ) ;
2022-03-22 16:55:49 +08:00
2021-03-03 19:03:31 +08:00
TestContext {
2021-06-17 00:33:33 +08:00
synonyms : hashmap ! {
2021-03-03 19:03:31 +08:00
vec! [ String ::from ( " hello " ) ] = > vec! [
vec! [ String ::from ( " hi " ) ] ,
vec! [ String ::from ( " good " ) , String ::from ( " morning " ) ] ,
] ,
vec! [ String ::from ( " world " ) ] = > vec! [
vec! [ String ::from ( " earth " ) ] ,
vec! [ String ::from ( " nature " ) ] ,
] ,
// new york city
vec! [ String ::from ( " nyc " ) ] = > vec! [
vec! [ String ::from ( " new " ) , String ::from ( " york " ) ] ,
vec! [ String ::from ( " new " ) , String ::from ( " york " ) , String ::from ( " city " ) ] ,
] ,
vec! [ String ::from ( " new " ) , String ::from ( " york " ) ] = > vec! [
vec! [ String ::from ( " nyc " ) ] ,
vec! [ String ::from ( " new " ) , String ::from ( " york " ) , String ::from ( " city " ) ] ,
] ,
vec! [ String ::from ( " new " ) , String ::from ( " york " ) , String ::from ( " city " ) ] = > vec! [
vec! [ String ::from ( " nyc " ) ] ,
vec! [ String ::from ( " new " ) , String ::from ( " york " ) ] ,
] ,
} ,
2021-06-17 00:33:33 +08:00
// The second argument is the number of random ids requested for each key.
// Keys made of two words are the ones queried by `word_pair_frequency`.
postings : hashmap ! {
2022-10-12 15:48:23 +08:00
String ::from ( " hello " ) = > random_postings ( rng , 1500 ) ,
String ::from ( " hi " ) = > random_postings ( rng , 4000 ) ,
String ::from ( " word " ) = > random_postings ( rng , 2500 ) ,
String ::from ( " split " ) = > random_postings ( rng , 400 ) ,
String ::from ( " ngrams " ) = > random_postings ( rng , 1400 ) ,
String ::from ( " world " ) = > random_postings ( rng , 15_000 ) ,
String ::from ( " earth " ) = > random_postings ( rng , 8000 ) ,
String ::from ( " 2021 " ) = > random_postings ( rng , 100 ) ,
String ::from ( " 2020 " ) = > random_postings ( rng , 500 ) ,
String ::from ( " is " ) = > random_postings ( rng , 50_000 ) ,
String ::from ( " this " ) = > random_postings ( rng , 50_000 ) ,
String ::from ( " good " ) = > random_postings ( rng , 1250 ) ,
String ::from ( " morning " ) = > random_postings ( rng , 125 ) ,
String ::from ( " word split " ) = > random_postings ( rng , 5000 ) ,
String ::from ( " quick brownfox " ) = > random_postings ( rng , 7000 ) ,
String ::from ( " quickbrown fox " ) = > random_postings ( rng , 8000 ) ,
2021-03-03 19:03:31 +08:00
} ,
2022-03-22 16:55:49 +08:00
exact_words ,
2021-03-03 19:03:31 +08:00
}
}
}
// Builds the query tree for a two-word query with `TermsMatchingStrategy::All`,
// typos authorized and no word limit, then compares its Debug rendering with
// the inline snapshot. The expected tree marks the last word and the
// concatenation of both words as `PrefixTolerant`.
#[ test ]
fn prefix ( ) {
let query = " hey friends " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
AND
Exact { word : " hey " }
PrefixTolerant { word : " friends " , max typo : 1 }
PrefixTolerant { word : " heyfriends " , max typo : 1 }
" ###);
2021-03-03 19:03:31 +08:00
}
// Same setup as the `prefix` test, but the expected tree uses plain `Tolerant`
// (non-prefix) nodes for the last word and for the concatenated word.
// NOTE(review): in this view the query literal reads the same as in `prefix`;
// presumably the two differ by whitespace in the real file — confirm.
#[ test ]
fn no_prefix ( ) {
let query = " hey friends " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
AND
Exact { word : " hey " }
Tolerant { word : " friends " , max typo : 1 }
Tolerant { word : " heyfriends " , max typo : 1 }
" ###);
2021-03-03 19:03:31 +08:00
}
// Snapshot test for a query whose two words both have entries in the fixture's
// synonym table: each word is expected to become an OR of its synonyms plus
// the tolerant original word, next to the concatenation of both words.
#[ test ]
fn synonyms ( ) {
let query = " hello world " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
AND
OR
Exact { word : " hi " }
AND
Exact { word : " good " }
Exact { word : " morning " }
Tolerant { word : " hello " , max typo : 1 }
OR
Exact { word : " earth " }
Exact { word : " nature " }
Tolerant { word : " world " , max typo : 1 }
Tolerant { word : " helloworld " , max typo : 1 }
" ###);
2021-03-03 19:03:31 +08:00
}
// Snapshot test for a three-word query for which the fixture registers
// multi-word synonyms (`nyc`, `new york`, `new york city`). The expected tree
// lists the alternatives produced for each way of grouping the words.
#[ test ]
fn complex_synonyms ( ) {
let query = " new york city " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
AND
Exact { word : " new " }
OR
AND
Exact { word : " york " }
Exact { word : " city " }
Tolerant { word : " yorkcity " , max typo : 1 }
AND
OR
Exact { word : " nyc " }
AND
Exact { word : " new " }
Exact { word : " york " }
Exact { word : " city " }
Tolerant { word : " newyork " , max typo : 1 }
Exact { word : " city " }
2022-08-18 23:36:08 +08:00
Exact { word : " nyc " }
AND
Exact { word : " new " }
Exact { word : " york " }
Tolerant { word : " newyorkcity " , max typo : 1 }
2022-08-04 17:00:46 +08:00
" ###);
2021-03-03 19:03:31 +08:00
}
// Snapshot test for a one-letter word followed by a longer word: the expected
// tree offers both the AND of the two words and their concatenation.
#[ test ]
fn ngrams ( ) {
let query = " n grams " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
AND
Exact { word : " n " }
Tolerant { word : " grams " , max typo : 1 }
Tolerant { word : " ngrams " , max typo : 1 }
" ###);
2021-03-03 19:03:31 +08:00
}
// Snapshot test for word splitting: the fixture holds postings for the pair
// `word split`, and the expected tree proposes that pair as a PHRASE next to
// the tolerant unsplit word.
#[ test ]
fn word_split ( ) {
let query = " wordsplit fish " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
AND
OR
2022-10-14 02:00:58 +08:00
PHRASE [ Some ( " word " ) , Some ( " split " ) ]
2022-08-04 17:00:46 +08:00
Tolerant { word : " wordsplit " , max typo : 2 }
Exact { word : " fish " }
Tolerant { word : " wordsplitfish " , max typo : 1 }
" ###);
2021-03-03 19:03:31 +08:00
}
2022-10-12 15:48:23 +08:00
// The fixture registers two candidate splits of the query word, generated with
// 7000 ids (`quick brownfox`) and 8000 ids (`quickbrown fox`). The expected
// tree contains only the `quickbrown` / `fox` phrase.
#[ test ]
fn word_split_choose_pair_with_max_freq ( ) {
let query = " quickbrownfox " ;
let tokens = query . tokenize ( ) ;
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
2022-10-14 02:00:58 +08:00
PHRASE [ Some ( " quickbrown " ) , Some ( " fox " ) ]
2022-10-12 15:48:23 +08:00
PrefixTolerant { word : " quickbrownfox " , max typo : 2 }
" ###);
}
2021-03-03 19:03:31 +08:00
// Snapshot test for a quoted pair of words followed by additional quote
// characters and a bare word: the expected tree is an AND of one PHRASE and
// one `Exact` word.
#[ test ]
fn phrase ( ) {
let query = " \" hey friends \" \" \" \" wooop " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
AND
2022-10-14 02:00:58 +08:00
PHRASE [ Some ( " hey " ) , Some ( " friends " ) ]
2022-08-04 17:00:46 +08:00
Exact { word : " wooop " }
" ###);
2021-03-03 19:03:31 +08:00
}
2022-09-01 18:02:10 +08:00
// Regression test (see the issue linked below) for a bare word followed by a
// quoted word, built with the default terms matching strategy. The expected
// root is an `OR(WORD)` whose first alternative keeps only the quoted word.
#[ test ]
fn phrase_2 ( ) {
// https://github.com/meilisearch/meilisearch/issues/2722
let query = " coco \" harry \" " ;
let tokens = query . tokenize ( ) ;
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::default ( ) , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR ( WORD )
Exact { word : " harry " }
AND
Exact { word : " coco " }
Exact { word : " harry " }
" ###);
}
2021-06-08 23:29:38 +08:00
// Snapshot test for a quoted query containing a `.`: the expected tree holds
// two separate PHRASE nodes, one for the words on each side of the dot.
#[ test ]
fn phrase_with_hard_separator ( ) {
let query = " \" hey friends. wooop wooop \" " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-06-08 23:29:38 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-06-08 23:29:38 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
AND
2022-10-14 02:00:58 +08:00
PHRASE [ Some ( " hey " ) , Some ( " friends " ) ]
PHRASE [ Some ( " wooop " ) , Some ( " wooop " ) ]
2022-08-04 17:00:46 +08:00
" ###);
2021-06-08 23:29:38 +08:00
}
2021-03-03 19:03:31 +08:00
// Snapshot test for a three-word query built with the default terms matching
// strategy: the expected root is an `OR(WORD)` whose alternatives cover the
// first word alone, the first two words, and all three words.
#[ test ]
fn optional_word ( ) {
let query = " hey my friend " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::default ( ) , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-02-19 05:18:36 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR ( WORD )
Exact { word : " hey " }
OR
AND
Exact { word : " hey " }
Exact { word : " my " }
Tolerant { word : " heymy " , max typo : 1 }
OR
AND
Exact { word : " hey " }
OR
AND
Exact { word : " my " }
Tolerant { word : " friend " , max typo : 1 }
Tolerant { word : " myfriend " , max typo : 1 }
AND
Tolerant { word : " heymy " , max typo : 1 }
Tolerant { word : " friend " , max typo : 1 }
Tolerant { word : " heymyfriend " , max typo : 1 }
" ###);
2021-02-19 05:18:36 +08:00
}
// Snapshot test for a query made of a single quoted phrase, built with the
// default terms matching strategy: the expected tree is one PHRASE node with
// no optional-word alternatives.
#[ test ]
fn optional_word_phrase ( ) {
let query = " \" hey my \" " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-02-19 05:18:36 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::default ( ) , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-02-19 05:18:36 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
2022-10-14 02:00:58 +08:00
PHRASE [ Some ( " hey " ) , Some ( " my " ) ]
2022-08-04 17:00:46 +08:00
" ###);
2021-02-19 05:18:36 +08:00
}
// Snapshot test for two quoted words surrounding two unquoted words, built
// with the default terms matching strategy. Every alternative of the expected
// `OR(WORD)` keeps both quoted words; only the unquoted ones vary.
#[ test ]
fn optional_word_multiple_phrases ( ) {
let query = r # ""hey" my good "friend""# ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-02-19 05:18:36 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::default ( ) , true , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR ( WORD )
AND
Exact { word : " hey " }
Exact { word : " friend " }
AND
Exact { word : " hey " }
Exact { word : " my " }
Exact { word : " friend " }
AND
Exact { word : " hey " }
OR
AND
Exact { word : " my " }
Exact { word : " good " }
Tolerant { word : " mygood " , max typo : 1 }
Exact { word : " friend " }
" ###);
2021-03-03 19:03:31 +08:00
}
// Snapshot test with `authorize_typos` set to `false`: every node of the
// expected tree, including the concatenated word, is `Exact`.
#[ test ]
fn no_typo ( ) {
let query = " hey friends " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , false , None , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-03-03 19:03:31 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
OR
AND
Exact { word : " hey " }
Exact { word : " friends " }
Exact { word : " heyfriends " }
" ###);
2021-03-03 19:03:31 +08:00
}
2021-04-14 01:10:58 +08:00
// Snapshot test with a word limit of `Some(2)` and typos disabled: the
// expected tree keeps the quoted phrase and the first bare word, and drops the
// remaining word of the query.
#[ test ]
fn words_limit ( ) {
let query = " \" hey my \" good friend " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2021-04-14 01:10:58 +08:00
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) = TestContext ::default ( )
. build ( TermsMatchingStrategy ::All , false , Some ( 2 ) , tokens )
. unwrap ( )
. unwrap ( ) ;
2021-04-14 01:10:58 +08:00
2022-08-04 17:00:46 +08:00
insta ::assert_debug_snapshot! ( query_tree , @ r ###"
AND
2022-10-14 02:00:58 +08:00
PHRASE [ Some ( " hey " ) , Some ( " my " ) ]
2022-08-04 17:00:46 +08:00
Exact { word : " good " }
" ###);
2021-04-14 01:10:58 +08:00
}
2022-03-31 19:50:18 +08:00
// Checks `typos` against a `TypoConfig` allowing up to 2 typos, with length
// thresholds of 5 (one typo) and 7 (two typos): the asserted results are one
// typo for `hello`, an exact query for `hell`, and two typos for
// `verylongword`.
#[ test ]
fn test_min_word_len_typo ( ) {
2022-05-24 18:14:55 +08:00
let exact_words = fst ::Set ::from_iter ( [ b " " ] ) . unwrap ( ) . map_data ( Cow ::Owned ) . unwrap ( ) ;
2022-05-24 15:43:17 +08:00
let config = TypoConfig {
max_typos : 2 ,
word_len_one_typo : 5 ,
word_len_two_typo : 7 ,
2022-05-24 18:14:55 +08:00
exact_words : Some ( & exact_words ) ,
2022-05-24 15:43:17 +08:00
} ;
2022-03-31 19:50:18 +08:00
assert_eq! (
typos ( " hello " . to_string ( ) , true , config . clone ( ) ) ,
QueryKind ::Tolerant { typo : 1 , word : " hello " . to_string ( ) }
) ;
assert_eq! (
typos ( " hell " . to_string ( ) , true , config . clone ( ) ) ,
QueryKind ::exact ( " hell " . to_string ( ) )
) ;
assert_eq! (
typos ( " verylongword " . to_string ( ) , true , config . clone ( ) ) ,
QueryKind ::Tolerant { typo : 2 , word : " verylongword " . to_string ( ) }
) ;
}
2022-03-22 16:55:49 +08:00
2022-11-24 16:00:53 +08:00
// Builds a query tree against a real `TempIndex` (word limit 10) for a query
// containing one extremely long word, and snapshots the resulting matching
// words: none of the expected entries mentions the long word.
#[ test ]
fn test_dont_create_matching_word_for_long_words ( ) {
let index = TempIndex ::new ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let query = " what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house " ;
let mut builder = QueryTreeBuilder ::new ( & rtxn , & index ) . unwrap ( ) ;
builder . words_limit ( 10 ) ;
let ( _ , _ , matching_words ) = builder . build ( query . tokenize ( ) ) . unwrap ( ) . unwrap ( ) ;
insta ::assert_snapshot! ( format! ( " {matching_words:?} " ) , @ r ###"
[
( [ MatchingWord { word : " house " , typo : 1 , prefix : true } ] , [ 3 ] )
( [ MatchingWord { word : " house " , typo : 1 , prefix : true } ] , [ 2 ] )
( [ MatchingWord { word : " whata " , typo : 1 , prefix : false } ] , [ 0 , 1 ] )
( [ MatchingWord { word : " house " , typo : 1 , prefix : true } ] , [ 2 ] )
( [ MatchingWord { word : " house " , typo : 1 , prefix : true } ] , [ 1 ] )
( [ MatchingWord { word : " what " , typo : 0 , prefix : false } ] , [ 0 ] )
( [ MatchingWord { word : " a " , typo : 0 , prefix : false } ] , [ 1 ] )
]
" ###);
}
2022-03-22 16:55:49 +08:00
// Overrides the fixture's exact-words set with one containing the query word
// and asserts that the resulting tree is a single prefix query whose kind is
// `QueryKind::Exact`, even though typos are authorized.
#[ test ]
fn disable_typo_on_word ( ) {
let query = " goodbye " ;
2022-06-02 21:47:28 +08:00
let tokens = query . tokenize ( ) ;
2022-03-22 16:55:49 +08:00
let exact_words = fst ::Set ::from_iter ( Some ( " goodbye " ) ) . unwrap ( ) . into_fst ( ) . into_inner ( ) ;
2022-05-24 15:43:17 +08:00
let exact_words = Some ( fst ::Set ::new ( exact_words ) . unwrap ( ) . map_data ( Cow ::Owned ) . unwrap ( ) ) ;
2022-03-22 16:55:49 +08:00
let context = TestContext { exact_words , .. Default ::default ( ) } ;
2022-08-18 23:36:08 +08:00
let ( query_tree , _ ) =
context . build ( TermsMatchingStrategy ::All , true , Some ( 2 ) , tokens ) . unwrap ( ) . unwrap ( ) ;
2022-03-22 16:55:49 +08:00
assert! ( matches! (
2022-04-04 17:52:35 +08:00
query_tree ,
2022-03-22 16:55:49 +08:00
Operation ::Query ( Query { prefix : true , kind : QueryKind ::Exact { .. } } )
) ) ;
}
2022-10-31 20:33:49 +08:00
/// Allocator instance installed for the whole test binary so that tests can
/// measure how much memory a piece of code allocates.
#[global_allocator]
static ALLOC: CountingAlloc =
    CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) };

/// Wrapper around the [`System`] allocator that keeps byte counters.
pub struct CountingAlloc {
    /// Bytes currently allocated and not yet freed.
    pub resident: AtomicI64,
    /// Total bytes ever allocated; never decreases.
    pub allocated: AtomicI64,
}

unsafe impl GlobalAlloc for CountingAlloc {
    /// Forwards to [`System`] and records the request only when it succeeded.
    ///
    /// A failed allocation returns a null pointer that is never passed to
    /// `dealloc`, so counting it would inflate both counters for good.
    unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
        // SAFETY: the caller upholds the `GlobalAlloc::alloc` contract for
        // `layout`, which is forwarded unchanged to the system allocator.
        let ptr = unsafe { System.alloc(layout) };
        if !ptr.is_null() {
            // `Layout` sizes never exceed `isize::MAX`, so this fits in `i64`
            // on targets whose pointers are at most 64 bits wide.
            let size = layout.size() as i64;
            self.allocated.fetch_add(size, atomic::Ordering::Relaxed);
            self.resident.fetch_add(size, atomic::Ordering::Relaxed);
        }
        ptr
    }

    /// Records the release of `layout.size()` bytes and forwards to [`System`].
    unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
        self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
        // SAFETY: the caller guarantees `ptr` was returned by `alloc` with this
        // same `layout`, and `alloc` always delegates to `System`.
        unsafe { System.dealloc(ptr, layout) }
    }
}
// Samples the `ALLOC` counters before and after building a query tree for a
// ten-word query on a `TempIndex`, then asserts upper bounds on the growth of
// both counters. The built value is kept alive until the end of the function
// so that it is still counted when the "after" samples are taken.
#[ test ]
2022-11-24 16:00:53 +08:00
fn memory_usage_of_ten_word_query ( ) {
2022-10-31 20:33:49 +08:00
let resident_before = ALLOC . resident . load ( atomic ::Ordering ::SeqCst ) ;
let allocated_before = ALLOC . allocated . load ( atomic ::Ordering ::SeqCst ) ;
let index = TempIndex ::new ( ) ;
let rtxn = index . read_txn ( ) . unwrap ( ) ;
let query = " a beautiful summer house by the beach overlooking what seems " ;
let mut builder = QueryTreeBuilder ::new ( & rtxn , & index ) . unwrap ( ) ;
builder . words_limit ( 10 ) ;
let x = builder . build ( query . tokenize ( ) ) . unwrap ( ) . unwrap ( ) ;
let resident_after = ALLOC . resident . load ( atomic ::Ordering ::SeqCst ) ;
let allocated_after = ALLOC . allocated . load ( atomic ::Ordering ::SeqCst ) ;
2022-11-24 16:00:53 +08:00
// Weak check on the memory usage
// Don't keep more than 5MB. (Arguably 5MB is already too high)
assert! ( resident_after - resident_before < 5_000_000 ) ;
// Don't allocate more than 10MB.
assert! ( allocated_after - allocated_before < 10_000_000 ) ;
// Use these snapshots to measure the exact memory usage.
// The values below were correct at the time I wrote them.
// insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950");
// insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502");
2022-10-31 20:33:49 +08:00
2022-11-24 16:00:53 +08:00
// Note, with the matching word cache deactivated, the memory usage was:
// insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697");
// insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588");
2022-10-31 20:33:49 +08:00
// or about 20x more resident memory (90MB vs 4.5MB)
// Use x
let _x = x ;
}
2021-03-03 19:03:31 +08:00
}