mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Merge #4667
4667: Frequency matching strategy r=Kerollmops a=ManyTheFish

# Pull Request

## Related issue
Fixes #3773

## What does this PR do?
- Add tests for the matching strategies
- Implement the frequency matching strategy

See the [PRD for more details](https://www.notion.so/meilisearch/Frequency-Matching-Strategy-0f3ba08833a442a39590a53a1505ab00). [Public API](https://www.notion.so/meilisearch/frequency-matching-strategy-89868fb7fc584026bc56e378eb854a7f).

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
commit
d6bd88ce4f
@ -477,6 +477,8 @@ pub enum MatchingStrategy {
|
|||||||
Last,
|
Last,
|
||||||
/// All query words are mandatory
|
/// All query words are mandatory
|
||||||
All,
|
All,
|
||||||
|
/// Remove query words from the most frequent to the least
|
||||||
|
Frequency,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for MatchingStrategy {
|
impl Default for MatchingStrategy {
|
||||||
@ -490,6 +492,7 @@ impl From<MatchingStrategy> for TermsMatchingStrategy {
|
|||||||
match other {
|
match other {
|
||||||
MatchingStrategy::Last => Self::Last,
|
MatchingStrategy::Last => Self::Last,
|
||||||
MatchingStrategy::All => Self::All,
|
MatchingStrategy::All => Self::All,
|
||||||
|
MatchingStrategy::Frequency => Self::Frequency,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -505,7 +505,7 @@ async fn search_bad_matching_strategy() {
|
|||||||
snapshot!(code, @"400 Bad Request");
|
snapshot!(code, @"400 Bad Request");
|
||||||
snapshot!(json_string!(response), @r###"
|
snapshot!(json_string!(response), @r###"
|
||||||
{
|
{
|
||||||
"message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`",
|
"message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`, `frequency`",
|
||||||
"code": "invalid_search_matching_strategy",
|
"code": "invalid_search_matching_strategy",
|
||||||
"type": "invalid_request",
|
"type": "invalid_request",
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"
|
"link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"
|
||||||
@ -527,7 +527,7 @@ async fn search_bad_matching_strategy() {
|
|||||||
snapshot!(code, @"400 Bad Request");
|
snapshot!(code, @"400 Bad Request");
|
||||||
snapshot!(json_string!(response), @r###"
|
snapshot!(json_string!(response), @r###"
|
||||||
{
|
{
|
||||||
"message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`",
|
"message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`, `frequency`",
|
||||||
"code": "invalid_search_matching_strategy",
|
"code": "invalid_search_matching_strategy",
|
||||||
"type": "invalid_request",
|
"type": "invalid_request",
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"
|
"link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy"
|
||||||
|
128
meilisearch/tests/search/matching_strategy.rs
Normal file
128
meilisearch/tests/search/matching_strategy.rs
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
use meili_snap::snapshot;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
use crate::common::index::Index;
|
||||||
|
use crate::common::{Server, Value};
|
||||||
|
use crate::json;
|
||||||
|
|
||||||
|
/// Creates the `test` index on `server`, queues `documents` for addition and
/// waits for task `0` before handing the index back to the caller.
///
/// NOTE(review): the task id is hard-coded to `0`, so this presumably relies on
/// the server being freshly created for each test — confirm against callers.
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
    let test_index = server.index("test");

    // `add_documents` takes the payload by value, hence the clone of the borrowed value.
    let payload = documents.clone();
    test_index.add_documents(payload, None).await;
    test_index.wait_task(0).await;

    test_index
}
|
||||||
|
|
||||||
|
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||||
|
json!([
|
||||||
|
{
|
||||||
|
"title": "Shazam!",
|
||||||
|
"id": "1",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Captain Planet",
|
||||||
|
"id": "2",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Captain Marvel",
|
||||||
|
"id": "3",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "a Captain Marvel ersatz",
|
||||||
|
"id": "4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "He's not part of the Marvel Cinematic Universe",
|
||||||
|
"id": "5"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "a Shazam ersatz, but better than Captain Planet",
|
||||||
|
"id": "6"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Capitain CAAAAAVEEERNE!!!!",
|
||||||
|
"id": "7"
|
||||||
|
}
|
||||||
|
])
|
||||||
|
});
|
||||||
|
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn simple_search() {
|
||||||
|
let server = Server::new().await;
|
||||||
|
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Captain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"2"},{"id":"6"},{"id":"7"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Captain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Captain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn search_with_typo() {
|
||||||
|
let server = Server::new().await;
|
||||||
|
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Capitain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"7"},{"id":"2"},{"id":"6"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Capitain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Capitain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn search_with_unknown_word() {
|
||||||
|
let server = Server::new().await;
|
||||||
|
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"2"},{"id":"3"},{"id":"4"},{"id":"6"},{"id":"7"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @"[]");
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
|
||||||
|
index
|
||||||
|
.search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| {
|
||||||
|
snapshot!(code, @"200 OK");
|
||||||
|
snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###);
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
@ -7,6 +7,7 @@ mod facet_search;
|
|||||||
mod formatted;
|
mod formatted;
|
||||||
mod geo;
|
mod geo;
|
||||||
mod hybrid;
|
mod hybrid;
|
||||||
|
mod matching_strategy;
|
||||||
mod multi;
|
mod multi;
|
||||||
mod pagination;
|
mod pagination;
|
||||||
mod restrict_searchable;
|
mod restrict_searchable;
|
||||||
|
@ -277,6 +277,8 @@ pub enum TermsMatchingStrategy {
|
|||||||
Last,
|
Last,
|
||||||
// all words are mandatory
|
// all words are mandatory
|
||||||
All,
|
All,
|
||||||
|
// remove the most frequent words first
|
||||||
|
Frequency,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for TermsMatchingStrategy {
|
impl Default for TermsMatchingStrategy {
|
||||||
|
@ -164,6 +164,21 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
|
|||||||
}
|
}
|
||||||
costs
|
costs
|
||||||
}
|
}
|
||||||
|
TermsMatchingStrategy::Frequency => {
|
||||||
|
let removal_order =
|
||||||
|
query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?;
|
||||||
|
let mut forbidden_nodes =
|
||||||
|
SmallBitmap::for_interned_values_in(&query_graph.nodes);
|
||||||
|
let mut costs = query_graph.nodes.map(|_| None);
|
||||||
|
// FIXME: this works because only the `words` ranking rule uses `TermsMatchingStrategy` at the moment.
|
||||||
|
for ns in removal_order {
|
||||||
|
for n in ns.iter() {
|
||||||
|
*costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
|
||||||
|
}
|
||||||
|
forbidden_nodes.union(&ns);
|
||||||
|
}
|
||||||
|
costs
|
||||||
|
}
|
||||||
TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
|
TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -197,6 +197,11 @@ fn resolve_maximally_reduced_query_graph(
|
|||||||
.iter()
|
.iter()
|
||||||
.flat_map(|x| x.iter())
|
.flat_map(|x| x.iter())
|
||||||
.collect(),
|
.collect(),
|
||||||
|
TermsMatchingStrategy::Frequency => query_graph
|
||||||
|
.removal_order_for_terms_matching_strategy_frequency(ctx)?
|
||||||
|
.iter()
|
||||||
|
.flat_map(|x| x.iter())
|
||||||
|
.collect(),
|
||||||
TermsMatchingStrategy::All => vec![],
|
TermsMatchingStrategy::All => vec![],
|
||||||
};
|
};
|
||||||
graph.remove_nodes_keep_edges(&nodes_to_remove);
|
graph.remove_nodes_keep_edges(&nodes_to_remove);
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::{Ordering, Reverse};
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::hash::{Hash, Hasher};
|
use std::hash::{Hash, Hasher};
|
||||||
|
|
||||||
use fxhash::{FxHashMap, FxHasher};
|
use fxhash::{FxHashMap, FxHasher};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::interner::{FixedSizeInterner, Interned};
|
use super::interner::{FixedSizeInterner, Interned};
|
||||||
use super::query_term::{
|
use super::query_term::{
|
||||||
@ -11,6 +12,7 @@ use super::query_term::{
|
|||||||
use super::small_bitmap::SmallBitmap;
|
use super::small_bitmap::SmallBitmap;
|
||||||
use super::SearchContext;
|
use super::SearchContext;
|
||||||
use crate::search::new::interner::Interner;
|
use crate::search::new::interner::Interner;
|
||||||
|
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
/// A node of the [`QueryGraph`].
|
/// A node of the [`QueryGraph`].
|
||||||
@ -290,6 +292,49 @@ impl QueryGraph {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn removal_order_for_terms_matching_strategy_frequency(
|
||||||
|
&self,
|
||||||
|
ctx: &mut SearchContext,
|
||||||
|
) -> Result<Vec<SmallBitmap<QueryNode>>> {
|
||||||
|
// lookup frequency for each term
|
||||||
|
let mut term_with_frequency: Vec<(u8, u64)> = {
|
||||||
|
let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
|
||||||
|
for (_, node) in self.nodes.iter() {
|
||||||
|
match &node.data {
|
||||||
|
QueryNodeData::Term(t) => {
|
||||||
|
let docids = compute_query_term_subset_docids(ctx, &t.term_subset)?;
|
||||||
|
for id in t.term_ids.clone() {
|
||||||
|
term_docids
|
||||||
|
.entry(id)
|
||||||
|
.and_modify(|curr| *curr |= &docids)
|
||||||
|
.or_insert_with(|| docids.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
term_docids
|
||||||
|
.into_iter()
|
||||||
|
.map(|(idx, docids)| match docids.len() {
|
||||||
|
0 => (idx, u64::max_value()),
|
||||||
|
frequency => (idx, frequency),
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
};
|
||||||
|
term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
|
||||||
|
let mut term_weight = BTreeMap::new();
|
||||||
|
let mut weight: u16 = 1;
|
||||||
|
let mut peekable = term_with_frequency.into_iter().peekable();
|
||||||
|
while let Some((idx, frequency)) = peekable.next() {
|
||||||
|
term_weight.insert(idx, weight);
|
||||||
|
if peekable.peek().map_or(false, |(_, f)| frequency != *f) {
|
||||||
|
weight += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
|
||||||
|
Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn removal_order_for_terms_matching_strategy_last(
|
pub fn removal_order_for_terms_matching_strategy_last(
|
||||||
&self,
|
&self,
|
||||||
ctx: &SearchContext,
|
ctx: &SearchContext,
|
||||||
@ -315,10 +360,19 @@ impl QueryGraph {
|
|||||||
if first_term_idx >= last_term_idx {
|
if first_term_idx >= last_term_idx {
|
||||||
return vec![];
|
return vec![];
|
||||||
}
|
}
|
||||||
|
|
||||||
let cost_of_term_idx = |term_idx: u8| {
|
let cost_of_term_idx = |term_idx: u8| {
|
||||||
let rank = 1 + last_term_idx - term_idx;
|
let rank = 1 + last_term_idx - term_idx;
|
||||||
rank as u16
|
rank as u16
|
||||||
};
|
};
|
||||||
|
self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn removal_order_for_terms_matching_strategy(
|
||||||
|
&self,
|
||||||
|
ctx: &SearchContext,
|
||||||
|
order: impl Fn(u8) -> u16,
|
||||||
|
) -> Vec<SmallBitmap<QueryNode>> {
|
||||||
let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
|
let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
|
||||||
let mut at_least_one_mandatory_term = false;
|
let mut at_least_one_mandatory_term = false;
|
||||||
for (node_id, node) in self.nodes.iter() {
|
for (node_id, node) in self.nodes.iter() {
|
||||||
@ -329,7 +383,7 @@ impl QueryGraph {
|
|||||||
}
|
}
|
||||||
let mut cost = 0;
|
let mut cost = 0;
|
||||||
for id in t.term_ids.clone() {
|
for id in t.term_ids.clone() {
|
||||||
cost = std::cmp::max(cost, cost_of_term_idx(id));
|
cost = std::cmp::max(cost, order(id));
|
||||||
}
|
}
|
||||||
nodes_to_remove
|
nodes_to_remove
|
||||||
.entry(cost)
|
.entry(cost)
|
||||||
|
@ -159,6 +159,7 @@ pub fn expected_order(
|
|||||||
|
|
||||||
match optional_words {
|
match optional_words {
|
||||||
TermsMatchingStrategy::Last => groups.into_iter().flatten().collect(),
|
TermsMatchingStrategy::Last => groups.into_iter().flatten().collect(),
|
||||||
|
TermsMatchingStrategy::Frequency => groups.into_iter().flatten().collect(),
|
||||||
TermsMatchingStrategy::All => {
|
TermsMatchingStrategy::All => {
|
||||||
groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()
|
groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user