From abdc4afcca564b5227c6ba3735fac3e5ea82ec48 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 29 May 2024 11:06:39 +0200 Subject: [PATCH 1/3] Implement Frequency matching strategy --- milli/src/search/mod.rs | 2 + .../search/new/graph_based_ranking_rule.rs | 15 +++++ milli/src/search/new/mod.rs | 5 ++ milli/src/search/new/query_graph.rs | 56 ++++++++++++++++++- milli/tests/search/mod.rs | 1 + 5 files changed, 78 insertions(+), 1 deletion(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index ca0eda49e..c85b80d2f 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -276,6 +276,8 @@ pub enum TermsMatchingStrategy { Last, // all words are mandatory All, + // remove more frequent word first + Frequency, } impl Default for TermsMatchingStrategy { diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 3136eb190..b066f82bd 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -164,6 +164,21 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase } costs } + TermsMatchingStrategy::Frequency => { + let removal_order = + query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?; + let mut forbidden_nodes = + SmallBitmap::for_interned_values_in(&query_graph.nodes); + let mut costs = query_graph.nodes.map(|_| None); + // FIXME: this works because only words uses termsmatchingstrategy at the moment. + for ns in removal_order { + for n in ns.iter() { + *costs.get_mut(n) = Some((1, forbidden_nodes.clone())); + } + forbidden_nodes.union(&ns); + } + costs + } TermsMatchingStrategy::All => query_graph.nodes.map(|_| None), } } else { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 5e4c2f829..f178b03cf 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -197,6 +197,11 @@ fn resolve_maximally_reduced_query_graph( .iter() .flat_map(|x| x.iter()) .collect(), + TermsMatchingStrategy::Frequency => query_graph + .removal_order_for_terms_matching_strategy_frequency(ctx)? + .iter() + .flat_map(|x| x.iter()) + .collect(), TermsMatchingStrategy::All => vec![], }; graph.remove_nodes_keep_edges(&nodes_to_remove); diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index d34d0afb5..9cbe55aff 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -3,6 +3,7 @@ use std::collections::BTreeMap; use std::hash::{Hash, Hasher}; use fxhash::{FxHashMap, FxHasher}; +use roaring::RoaringBitmap; use super::interner::{FixedSizeInterner, Interned}; use super::query_term::{ @@ -11,6 +12,7 @@ use super::query_term::{ use super::small_bitmap::SmallBitmap; use super::SearchContext; use crate::search::new::interner::Interner; +use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::Result; /// A node of the [`QueryGraph`]. @@ -290,6 +292,49 @@ impl QueryGraph { } } + pub fn removal_order_for_terms_matching_strategy_frequency( + &self, + ctx: &mut SearchContext, + ) -> Result>> { + // lookup frequency for each term + let mut term_with_frequency: Vec<(u8, u64)> = { + let mut term_docids: BTreeMap = Default::default(); + for (_, node) in self.nodes.iter() { + match &node.data { + QueryNodeData::Term(t) => { + let docids = compute_query_term_subset_docids(ctx, &t.term_subset)?; + for id in t.term_ids.clone() { + term_docids + .entry(id) + .and_modify(|curr| *curr |= &docids) + .or_insert_with(|| docids.clone()); + } + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, + } + } + term_docids + .into_iter() + .map(|(idx, docids)| match docids.len() { + 0 => (idx, u64::max_value()), + frequency => (idx, frequency), + }) + .collect() + }; + term_with_frequency.sort_by_key(|(_, frequency)| *frequency); + let mut term_weight = BTreeMap::new(); + let mut weight: u16 = 1; + let mut peekable = term_with_frequency.into_iter().peekable(); + while let Some((idx, frequency)) = peekable.next() { + term_weight.insert(idx, weight); + if peekable.peek().map_or(false, |(_, f)| frequency < *f) { + weight += 1; + } + } + let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap(); + Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)) + } + pub fn removal_order_for_terms_matching_strategy_last( &self, ctx: &SearchContext, @@ -315,10 +360,19 @@ impl QueryGraph { if first_term_idx >= last_term_idx { return vec![]; } + let cost_of_term_idx = |term_idx: u8| { let rank = 1 + last_term_idx - term_idx; rank as u16 }; + self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx) + } + + pub fn removal_order_for_terms_matching_strategy( + &self, + ctx: &SearchContext, + order: impl Fn(u8) -> u16, + ) -> Vec> { let mut nodes_to_remove = BTreeMap::>::new(); let mut at_least_one_mandatory_term = false; for (node_id, node) in self.nodes.iter() { @@ -329,7 +383,7 @@ impl QueryGraph { } let mut cost = 0; for id in t.term_ids.clone() { - cost = std::cmp::max(cost, cost_of_term_idx(id)); + cost = std::cmp::max(cost, order(id)); } nodes_to_remove .entry(cost) diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 9193ab762..310780e03 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -159,6 +159,7 @@ pub fn expected_order( match optional_words { TermsMatchingStrategy::Last => groups.into_iter().flatten().collect(), + TermsMatchingStrategy::Frequency => groups.into_iter().flatten().collect(), TermsMatchingStrategy::All => { groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect() } From 6a4b2516aa5ecb46d726c129c3adc531fb3c488c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 29 May 2024 16:21:24 +0200 Subject: [PATCH 2/3] WIP --- meilisearch/src/search.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 34ebe463d..926f0e72b 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -424,6 +424,8 @@ pub enum MatchingStrategy { Last, /// All query words are mandatory All, + /// Remove query words from the most frequent to the least + Frequency, } impl Default for MatchingStrategy { @@ -437,6 +439,7 @@ impl From for TermsMatchingStrategy { match other { MatchingStrategy::Last => Self::Last, MatchingStrategy::All => Self::All, + MatchingStrategy::Frequency => Self::Frequency, } } } From 3f1a510069a616026db45329784142e3e7d60930 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 30 May 2024 12:02:42 +0200 Subject: [PATCH 3/3] Add tests and fix matching strategy --- meilisearch/tests/search/errors.rs | 4 +- meilisearch/tests/search/matching_strategy.rs | 128 ++++++++++++++++++ meilisearch/tests/search/mod.rs | 1 + milli/src/search/new/query_graph.rs | 6 +- 4 files changed, 134 insertions(+), 5 deletions(-) create mode 100644 meilisearch/tests/search/matching_strategy.rs diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 8be70d162..cce1a86e7 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -505,7 +505,7 @@ async fn search_bad_matching_strategy() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`", + "message": "Unknown value `doggo` at `.matchingStrategy`: expected one of `last`, `all`, `frequency`", "code": "invalid_search_matching_strategy", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy" @@ -527,7 +527,7 @@ async fn search_bad_matching_strategy() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`", + "message": "Unknown value `doggo` for parameter `matchingStrategy`: expected one of `last`, `all`, `frequency`", "code": "invalid_search_matching_strategy", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_matching_strategy" diff --git a/meilisearch/tests/search/matching_strategy.rs b/meilisearch/tests/search/matching_strategy.rs new file mode 100644 index 000000000..a4cb19f62 --- /dev/null +++ b/meilisearch/tests/search/matching_strategy.rs @@ -0,0 +1,128 @@ +use meili_snap::snapshot; +use once_cell::sync::Lazy; + +use crate::common::index::Index; +use crate::common::{Server, Value}; +use crate::json; + +async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> { + let index = server.index("test"); + + index.add_documents(documents.clone(), None).await; + index.wait_task(0).await; + index +} + +static SIMPLE_SEARCH_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "title": "Shazam!", + "id": "1", + }, + { + "title": "Captain Planet", + "id": "2", + }, + { + "title": "Captain Marvel", + "id": "3", + }, + { + "title": "a Captain Marvel ersatz", + "id": "4" + }, + { + "title": "He's not part of the Marvel Cinematic Universe", + "id": "5" + }, + { + "title": "a Shazam ersatz, but better than Captain Planet", + "id": "6" + }, + { + "title": "Capitain CAAAAAVEEERNE!!!!", + "id": "7" + } + ]) +}); + +#[actix_rt::test] +async fn simple_search() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + index + .search(json!({"q": "Captain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"2"},{"id":"6"},{"id":"7"}]"###); + }) + .await; + + index + .search(json!({"q": "Captain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###); + }) + .await; + + index + .search(json!({"q": "Captain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###); + }) + .await; +} + +#[actix_rt::test] +async fn search_with_typo() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + index + .search(json!({"q": "Capitain Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"7"},{"id":"2"},{"id":"6"}]"###); + }) + .await; + + index + .search(json!({"q": "Capitain Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"}]"###); + }) + .await; + + index + .search(json!({"q": "Capitain Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###); + }) + .await; +} + +#[actix_rt::test] +async fn search_with_unknown_word() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + index + .search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "last", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"2"},{"id":"3"},{"id":"4"},{"id":"6"},{"id":"7"}]"###); + }) + .await; + + index + .search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "all", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @"[]"); + }) + .await; + + index + .search(json!({"q": "Captain Supercopter Marvel", "matchingStrategy": "frequency", "attributesToRetrieve": ["id"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"id":"3"},{"id":"4"},{"id":"5"}]"###); + }) + .await; +} diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 56fa226b2..284b68a15 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -7,6 +7,7 @@ mod facet_search; mod formatted; mod geo; mod hybrid; +mod matching_strategy; mod multi; mod pagination; mod restrict_searchable; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 9cbe55aff..cda767d75 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,4 +1,4 @@ -use std::cmp::Ordering; +use std::cmp::{Ordering, Reverse}; use std::collections::BTreeMap; use std::hash::{Hash, Hasher}; @@ -321,13 +321,13 @@ impl QueryGraph { }) .collect() }; - term_with_frequency.sort_by_key(|(_, frequency)| *frequency); + term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency)); let mut term_weight = BTreeMap::new(); let mut weight: u16 = 1; let mut peekable = term_with_frequency.into_iter().peekable(); while let Some((idx, frequency)) = peekable.next() { term_weight.insert(idx, weight); - if peekable.peek().map_or(false, |(_, f)| frequency < *f) { + if peekable.peek().map_or(false, |(_, f)| frequency != *f) { weight += 1; } }