2023-04-05 17:20:04 +08:00
|
|
|
/*!
|
|
|
|
This module tests the Proximity ranking rule:
|
|
|
|
|
|
|
|
1. A proximity of >7 always has the same cost.
|
|
|
|
|
|
|
|
2. Phrase terms can be in proximity to other terms via their start and end words,
|
|
|
|
but we need to make sure that the phrase exists in the document that meets this
|
|
|
|
proximity condition. This is especially relevant with split words and synonyms.
|
|
|
|
|
|
|
|
3. An ngram has the same proximity cost as its component words being consecutive.
|
|
|
|
e.g. `sunflower` is equivalent to `sun flower`.
|
|
|
|
|
|
|
|
4. The prefix databases can be used to find the proximity between two words, but
|
|
|
|
they store fewer proximities than the regular word proximity DB.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
use std::collections::HashMap;
|
|
|
|
|
|
|
|
use crate::{
|
|
|
|
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
|
|
|
|
SearchResult, TermsMatchingStrategy,
|
|
|
|
};
|
|
|
|
|
|
|
|
fn create_simple_index() -> TempIndex {
|
|
|
|
let index = TempIndex::new();
|
|
|
|
|
|
|
|
index
|
|
|
|
.update_settings(|s| {
|
|
|
|
s.set_primary_key("id".to_owned());
|
|
|
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
|
|
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
|
|
|
})
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
index
|
|
|
|
.add_documents(documents!([
|
|
|
|
{
|
|
|
|
"id": 0,
|
|
|
|
"text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 1,
|
|
|
|
"text": "the. quick brown fox jumps over the lazy. dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 2,
|
|
|
|
"text": "the quick brown fox jumps over the lazy. dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 3,
|
|
|
|
"text": "dog the quick brown fox jumps over the lazy"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 4,
|
|
|
|
"text": "the quickbrown fox jumps over the lazy dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 5,
|
|
|
|
"text": "brown quick fox jumps over the lazy dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 6,
|
|
|
|
"text": "the really quick brown fox jumps over the very lazy dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 7,
|
|
|
|
"text": "the really quick brown fox jumps over the lazy dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 8,
|
|
|
|
"text": "the quick brown fox jumps over the lazy"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 9,
|
|
|
|
"text": "the quack brown fox jumps over the lazy"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 9,
|
|
|
|
"text": "the quack brown fox jumps over the lazy dog"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 10,
|
|
|
|
"text": "the quick brown fox jumps over the lazy dog"
|
|
|
|
}
|
|
|
|
]))
|
|
|
|
.unwrap();
|
|
|
|
index
|
|
|
|
}
|
|
|
|
|
|
|
|
fn create_edge_cases_index() -> TempIndex {
|
|
|
|
let index = TempIndex::new();
|
|
|
|
|
|
|
|
index
|
|
|
|
.update_settings(|s| {
|
|
|
|
s.set_primary_key("id".to_owned());
|
|
|
|
s.set_searchable_fields(vec!["text".to_owned()]);
|
|
|
|
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
|
|
|
})
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
index.add_documents(documents!([
|
|
|
|
{
|
|
|
|
// This document will insert "s" in the prefix database
|
|
|
|
"id": 0,
|
|
|
|
"text": "
|
|
|
|
saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz
|
|
|
|
sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz
|
|
|
|
sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz
|
|
|
|
sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz
|
|
|
|
sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez
|
|
|
|
sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz
|
|
|
|
sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz
|
|
|
|
ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz
|
|
|
|
sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz
|
|
|
|
sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz
|
|
|
|
sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz
|
|
|
|
soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz
|
|
|
|
spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz
|
|
|
|
sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz
|
|
|
|
sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz
|
|
|
|
ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz
|
|
|
|
sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
|
|
|
|
"
|
|
|
|
},
|
|
|
|
// The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
|
|
|
|
// If the search query is "sunflower", the split word "Sun Flower" will match some documents.
|
|
|
|
// If the query is `sunflower wilting`, then we should make sure that
|
|
|
|
// the proximity condition `flower wilting: prox N` also comes with the condition
|
|
|
|
// `sun wilting: prox N+1`. TODO: this is not the exact condition we use for now.
|
|
|
|
// We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which
|
|
|
|
// is better than nothing but not the best.
|
|
|
|
{
|
|
|
|
"id": 1,
|
|
|
|
"text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat."
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 2,
|
|
|
|
"text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat."
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"id": 3,
|
|
|
|
// This document matches the query `sunflower wilting`, but the proximity condition
|
|
|
|
// between `sunflower` and `wilting` cannot be through the split-word `Sun Flower`
|
|
|
|
// which would reduce to only `flower` and `wilting` being in proximity.
|
|
|
|
"text": "A flower wilting under the sun, unlike a sunflower"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// This should be the best document for `sunflower wilting`
|
|
|
|
"id": 4,
|
|
|
|
"text": "sun flower wilting under the heat"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// This is also the best document for `sunflower wilting`
|
|
|
|
"id": 5,
|
|
|
|
"text": "sunflower wilting under the heat"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Prox MAX between `best` and `s` prefix
|
|
|
|
"id": 6,
|
|
|
|
"text": "this is the best meal I have ever had in such a beautiful summer day"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Prox 5 between `best` and `s` prefix
|
|
|
|
"id": 7,
|
|
|
|
"text": "this is the best cooked meal of the summer"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Prox 4 between `best` and `s` prefix
|
|
|
|
"id": 8,
|
|
|
|
"text": "this is the best meal of the summer"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Prox 3 between `best` and `s` prefix
|
|
|
|
"id": 9,
|
|
|
|
"text": "this is the best meal of summer"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Prox 1 between `best` and `s` prefix
|
|
|
|
"id": 10,
|
|
|
|
"text": "this is the best summer meal"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Reverse Prox 3 between `best` and `s` prefix
|
|
|
|
"id": 11,
|
|
|
|
"text": "summer x y best"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Reverse Prox 2 between `best` and `s` prefix
|
|
|
|
"id": 12,
|
|
|
|
"text": "summer x best"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
// Reverse Prox 1 between `best` and `s` prefix
|
|
|
|
"id": 13,
|
|
|
|
"text": "summer best"
|
|
|
|
},
|
|
|
|
])).unwrap();
|
|
|
|
index
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_proximity_simple() {
|
|
|
|
let index = create_simple_index();
|
|
|
|
let txn = index.read_txn().unwrap();
|
|
|
|
|
|
|
|
let mut s = Search::new(&txn, &index);
|
|
|
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
|
|
|
s.query("the quick brown fox jumps over the lazy dog");
|
|
|
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
|
|
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 9, 10, 7, 6, 5, 2, 3, 0, 1]");
|
|
|
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
|
|
|
insta::assert_debug_snapshot!(texts, @r###"
|
|
|
|
[
|
|
|
|
"\"the quickbrown fox jumps over the lazy dog\"",
|
|
|
|
"\"the quack brown fox jumps over the lazy dog\"",
|
|
|
|
"\"the quick brown fox jumps over the lazy dog\"",
|
|
|
|
"\"the really quick brown fox jumps over the lazy dog\"",
|
|
|
|
"\"the really quick brown fox jumps over the very lazy dog\"",
|
|
|
|
"\"brown quick fox jumps over the lazy dog\"",
|
|
|
|
"\"the quick brown fox jumps over the lazy. dog\"",
|
|
|
|
"\"dog the quick brown fox jumps over the lazy\"",
|
|
|
|
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
|
|
|
|
"\"the. quick brown fox jumps over the lazy. dog\"",
|
|
|
|
]
|
|
|
|
"###);
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_proximity_split_word() {
|
|
|
|
let index = create_edge_cases_index();
|
|
|
|
let txn = index.read_txn().unwrap();
|
|
|
|
|
|
|
|
let mut s = Search::new(&txn, &index);
|
|
|
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
|
|
|
s.query("sunflower wilting");
|
|
|
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
|
|
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
|
|
|
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
|
|
|
// TODO: "2" and "4" should be swapped ideally
|
|
|
|
insta::assert_debug_snapshot!(texts, @r###"
|
|
|
|
[
|
|
|
|
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
|
|
|
"\"sun flower wilting under the heat\"",
|
|
|
|
"\"sunflower wilting under the heat\"",
|
|
|
|
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
|
|
|
"\"A flower wilting under the sun, unlike a sunflower\"",
|
|
|
|
]
|
|
|
|
"###);
|
|
|
|
|
|
|
|
let mut s = Search::new(&txn, &index);
|
|
|
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
|
|
|
s.query("\"sun flower\" wilting");
|
|
|
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
|
|
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
|
|
|
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
|
|
|
// TODO: "2" and "4" should be swapped ideally
|
|
|
|
insta::assert_debug_snapshot!(texts, @r###"
|
|
|
|
[
|
|
|
|
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
|
|
|
"\"sun flower wilting under the heat\"",
|
|
|
|
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
|
|
|
]
|
|
|
|
"###);
|
|
|
|
drop(txn);
|
|
|
|
|
|
|
|
index
|
|
|
|
.update_settings(|s| {
|
|
|
|
let mut syns = HashMap::new();
|
|
|
|
syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
|
|
|
|
s.set_synonyms(syns);
|
|
|
|
})
|
|
|
|
.unwrap();
|
|
|
|
let txn = index.read_txn().unwrap();
|
|
|
|
|
|
|
|
let mut s = Search::new(&txn, &index);
|
|
|
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
|
|
|
s.query("xyz wilting");
|
|
|
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
|
|
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
|
|
|
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
|
|
|
// TODO: "2" and "4" should be swapped ideally
|
|
|
|
insta::assert_debug_snapshot!(texts, @r###"
|
|
|
|
[
|
|
|
|
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
|
|
|
"\"sun flower wilting under the heat\"",
|
|
|
|
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
|
|
|
]
|
|
|
|
"###);
|
|
|
|
}
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
fn test_proximity_prefix_db() {
|
|
|
|
let index = create_edge_cases_index();
|
|
|
|
let txn = index.read_txn().unwrap();
|
|
|
|
|
|
|
|
let mut s = Search::new(&txn, &index);
|
|
|
|
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
|
|
|
s.query("best s");
|
|
|
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
|
|
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11]");
|
|
|
|
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
|
|
|
|
|
|
|
// This test illustrates the loss of precision from using the prefix DB
|
|
|
|
insta::assert_debug_snapshot!(texts, @r###"
|
|
|
|
[
|
|
|
|
"\"this is the best summer meal\"",
|
|
|
|
"\"summer best\"",
|
|
|
|
"\"this is the best meal of summer\"",
|
|
|
|
"\"summer x best\"",
|
|
|
|
"\"this is the best meal of the summer\"",
|
|
|
|
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
|
|
|
"\"this is the best cooked meal of the summer\"",
|
|
|
|
"\"summer x y best\"",
|
|
|
|
]
|
|
|
|
"###);
|
|
|
|
}
|