meilisearch/milli/src/search/new/tests/proximity.rs

318 lines
13 KiB
Rust
Raw Normal View History

2023-04-05 17:20:04 +08:00
/*!
This module tests the Proximity ranking rule:
1. A proximity of >7 always has the same cost.
2. Phrase terms can be in proximity to other terms via their start and end words,
but we need to make sure that the phrase exists in the document that meets this
proximity condition. This is especially relevant with split words and synonyms.
3. An ngram has the same proximity cost as its component words being consecutive.
e.g. `sunflower` equivalent to `sun flower`.
4. The prefix databases can be used to find the proximity between two words, but
they store fewer proximities than the regular word proximity DB.
*/
use std::collections::HashMap;
use crate::{
index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
SearchResult, TermsMatchingStrategy,
};
fn create_simple_index() -> TempIndex {
let index = TempIndex::new();
index
.update_settings(|s| {
s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["text".to_owned()]);
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
})
.unwrap();
index
.add_documents(documents!([
{
"id": 0,
"text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog"
},
{
"id": 1,
"text": "the. quick brown fox jumps over the lazy. dog"
},
{
"id": 2,
"text": "the quick brown fox jumps over the lazy. dog"
},
{
"id": 3,
"text": "dog the quick brown fox jumps over the lazy"
},
{
"id": 4,
"text": "the quickbrown fox jumps over the lazy dog"
},
{
"id": 5,
"text": "brown quick fox jumps over the lazy dog"
},
{
"id": 6,
"text": "the really quick brown fox jumps over the very lazy dog"
},
{
"id": 7,
"text": "the really quick brown fox jumps over the lazy dog"
},
{
"id": 8,
"text": "the quick brown fox jumps over the lazy"
},
{
"id": 9,
"text": "the quack brown fox jumps over the lazy"
},
{
"id": 9,
"text": "the quack brown fox jumps over the lazy dog"
},
{
"id": 10,
"text": "the quick brown fox jumps over the lazy dog"
}
]))
.unwrap();
index
}
fn create_edge_cases_index() -> TempIndex {
let index = TempIndex::new();
index
.update_settings(|s| {
s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["text".to_owned()]);
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
})
.unwrap();
index.add_documents(documents!([
{
// This document will insert "s" in the prefix database
"id": 0,
"text": "
saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz
sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz
sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz
sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz
sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez
sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz
sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz
ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz
sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz
sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz
sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz
soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz
spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz
sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz
sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz
ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz
sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
"
},
// The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
// If the search query is "sunflower", the split word "Sun Flower" will match some documents.
// If the query is `sunflower wilting`, then we should make sure that
// the proximity condition `flower wilting: prox N` also comes with the condition
// `sun wilting: prox N+1`. TODO: this is not the exact condition we use for now.
// We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which
// is better than nothing but not the best.
{
"id": 1,
"text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat."
},
{
"id": 2,
"text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat."
},
{
"id": 3,
// This document matches the query `sunflower wilting`, but the proximity condition
// between `sunflower` and `wilting` cannot be through the split-word `Sun Flower`
// which would reduce to only `flower` and `wilting` being in proximity.
"text": "A flower wilting under the sun, unlike a sunflower"
},
{
// This should be the best document for `sunflower wilting`
"id": 4,
"text": "sun flower wilting under the heat"
},
{
// This is also the best document for `sunflower wilting`
"id": 5,
"text": "sunflower wilting under the heat"
},
{
// Prox MAX between `best` and `s` prefix
"id": 6,
"text": "this is the best meal I have ever had in such a beautiful summer day"
},
{
// Prox 5 between `best` and `s` prefix
"id": 7,
"text": "this is the best cooked meal of the summer"
},
{
// Prox 4 between `best` and `s` prefix
"id": 8,
"text": "this is the best meal of the summer"
},
{
// Prox 3 between `best` and `s` prefix
"id": 9,
"text": "this is the best meal of summer"
},
{
// Prox 1 between `best` and `s` prefix
"id": 10,
"text": "this is the best summer meal"
},
{
// Reverse Prox 3 between `best` and `s` prefix
"id": 11,
"text": "summer x y best"
},
{
// Reverse Prox 2 between `best` and `s` prefix
"id": 12,
"text": "summer x best"
},
{
// Reverse Prox 1 between `best` and `s` prefix
"id": 13,
"text": "summer best"
},
])).unwrap();
index
}
#[test]
fn test_proximity_simple() {
let index = create_simple_index();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 9, 10, 7, 6, 5, 2, 3, 0, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quickbrown fox jumps over the lazy dog\"",
"\"the quack brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the very lazy dog\"",
"\"brown quick fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy. dog\"",
"\"dog the quick brown fox jumps over the lazy\"",
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
"\"the. quick brown fox jumps over the lazy. dog\"",
]
"###);
}
#[test]
fn test_proximity_split_word() {
let index = create_edge_cases_index();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("sunflower wilting");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
"\"sun flower wilting under the heat\"",
"\"sunflower wilting under the heat\"",
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
"\"A flower wilting under the sun, unlike a sunflower\"",
]
"###);
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("\"sun flower\" wilting");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
"\"sun flower wilting under the heat\"",
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
]
"###);
drop(txn);
index
.update_settings(|s| {
let mut syns = HashMap::new();
syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
s.set_synonyms(syns);
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("xyz wilting");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
"\"sun flower wilting under the heat\"",
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
]
"###);
}
#[test]
fn test_proximity_prefix_db() {
let index = create_edge_cases_index();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("best s");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// This test illustrates the loss of precision from using the prefix DB
insta::assert_debug_snapshot!(texts, @r###"
[
"\"this is the best summer meal\"",
"\"summer best\"",
"\"this is the best meal of summer\"",
"\"summer x best\"",
"\"this is the best meal of the summer\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best cooked meal of the summer\"",
"\"summer x y best\"",
]
"###);
}