From 1ff2a2d6fb20c25a51f87869b833b17f307f659f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 3 Apr 2024 09:35:07 +0200 Subject: [PATCH] Add semanticHitCount --- .../src/analytics/segment_analytics.rs | 1 + meilisearch/src/search.rs | 31 ++++--- meilisearch/tests/search/hybrid.rs | 30 +++++++ milli/src/search/hybrid.rs | 82 +++++++++++++------ 4 files changed, 108 insertions(+), 36 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index fcf4d9144..c49a04576 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -760,6 +760,7 @@ impl SearchAggregator { query: _, processing_time_ms, hits_info: _, + semantic_hit_count: _, facet_distribution: _, facet_stats: _, degraded, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 7cb860f2e..85438e816 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -385,6 +385,9 @@ pub struct SearchResult { #[serde(skip_serializing_if = "Option::is_none")] pub facet_stats: Option<BTreeMap<String, FacetStats>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub semantic_hit_count: Option<u32>, + // These fields are only used for analytics purposes #[serde(skip)] pub degraded: bool, @@ -553,16 +556,23 @@ pub fn perform_search( let (search, is_finite_pagination, max_total_hits, offset) = prepare_search(index, &rtxn, &query, &search_kind, time_budget)?; - let milli::SearchResult { - documents_ids, - matching_words, - candidates, - document_scores, - degraded, - used_negative_operator, - .. - } = match &search_kind { - SearchKind::KeywordOnly | SearchKind::SemanticOnly { .. } => search.execute()?, + let ( + milli::SearchResult { + documents_ids, + matching_words, + candidates, + document_scores, + degraded, + used_negative_operator, + }, + semantic_hit_count, + ) = match &search_kind { + SearchKind::KeywordOnly => (search.execute()?, None), + SearchKind::SemanticOnly { .. 
} => { + let results = search.execute()?; + let semantic_hit_count = results.document_scores.len() as u32; + (results, Some(semantic_hit_count)) + } SearchKind::Hybrid { semantic_ratio, .. } => search.execute_hybrid(*semantic_ratio)?, }; @@ -760,6 +770,7 @@ pub fn perform_search( facet_stats, degraded, used_negative_operator, + semantic_hit_count, }; Ok(result) } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 8decb7ded..77c4f30a3 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -77,6 +77,16 @@ async fn simple_search() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###); + snapshot!(response["semanticHitCount"], @"0"); + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###); + snapshot!(response["semanticHitCount"], @"1"); let (response, code) = index .search_post( @@ -85,6 +95,7 @@ async fn simple_search() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the 
Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###); + snapshot!(response["semanticHitCount"], @"3"); } #[actix_rt::test] @@ -136,6 +147,7 @@ async fn highlighter() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); + snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], @@ -149,6 +161,7 @@ async fn highlighter() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_semanticScore":0.9472136}]"###); + snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index @@ -163,6 +176,7 @@ async fn highlighter() { .await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}}]"###); + snapshot!(response["semanticHitCount"], @"3"); } #[actix_rt::test] @@ -250,4 +264,20 @@ async fn single_document() { snapshot!(code, @"200 OK"); snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); + snapshot!(response["semanticHitCount"], @"1"); +} + +#[actix_rt::test] +async fn query_combination() { + let server = Server::new().await; + let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + // search without query and vector, but with hybrid => still placeholder + let (response, code) = index + .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .await; + + snapshot!(code, @"200 OK"); + 
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###); + snapshot!(response["semanticHitCount"], @"1"); } diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index e45652206..fc13a5e1e 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -84,45 +84,73 @@ impl ScoreWithRatioResult { } } - fn merge(left: Self, right: Self, from: usize, length: usize) -> SearchResult { - let mut documents_ids = - Vec::with_capacity(left.document_scores.len() + right.document_scores.len()); - let mut document_scores = - Vec::with_capacity(left.document_scores.len() + right.document_scores.len()); + fn merge( + vector_results: Self, + keyword_results: Self, + from: usize, + length: usize, + ) -> (SearchResult, u32) { + #[derive(Clone, Copy)] + enum ResultSource { + Semantic, + Keyword, + } + let mut semantic_hit_count = 0; + + let mut documents_ids = Vec::with_capacity( + vector_results.document_scores.len() + keyword_results.document_scores.len(), + ); + let mut document_scores = Vec::with_capacity( + vector_results.document_scores.len() + keyword_results.document_scores.len(), + ); let mut documents_seen = RoaringBitmap::new(); - for (docid, (main_score, _sub_score)) in left + for ((docid, (main_score, _sub_score)), source) in vector_results .document_scores .into_iter() - .merge_by(right.document_scores.into_iter(), |(_, left), (_, right)| { - // the first value is the one with the greatest score - compare_scores(left, right).is_ge() - }) + .zip(std::iter::repeat(ResultSource::Semantic)) + .merge_by( + keyword_results + .document_scores + .into_iter() + .zip(std::iter::repeat(ResultSource::Keyword)), + |((_, left), _), ((_, right), _)| { + // the first value is the one with the greatest score + compare_scores(left, right).is_ge() + }, + ) // remove documents we already saw - .filter(|(docid, _)| 
documents_seen.insert(*docid)) + .filter(|((docid, _), _)| documents_seen.insert(*docid)) // start skipping **after** the filter .skip(from) // take **after** skipping .take(length) { + if let ResultSource::Semantic = source { + semantic_hit_count += 1; + } documents_ids.push(docid); // TODO: pass both scores to documents_score in some way? document_scores.push(main_score); } - SearchResult { - matching_words: right.matching_words, - candidates: left.candidates | right.candidates, - documents_ids, - document_scores, - degraded: left.degraded | right.degraded, - used_negative_operator: left.used_negative_operator | right.used_negative_operator, - } + ( + SearchResult { + matching_words: keyword_results.matching_words, + candidates: vector_results.candidates | keyword_results.candidates, + documents_ids, + document_scores, + degraded: vector_results.degraded | keyword_results.degraded, + used_negative_operator: vector_results.used_negative_operator + | keyword_results.used_negative_operator, + }, + semantic_hit_count, + ) } } impl<'a> Search<'a> { - pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result<SearchResult> { + pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result<(SearchResult, Option<u32>)> { // TODO: find classier way to achieve that than to reset vector and query params // create separate keyword and semantic searches let mut search = Search { @@ -148,14 +176,16 @@ impl<'a> Search<'a> { // completely skip semantic search if the results of the keyword search are good enough if self.results_good_enough(&keyword_results, semantic_ratio) { - return Ok(keyword_results); + return Ok((keyword_results, Some(0))); } // no vector search against placeholder search - let Some(query) = search.query.take() else { return Ok(keyword_results) }; + let Some(query) = search.query.take() else { + return Ok((keyword_results, Some(0))); + }; // no embedder, no semantic search let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else { - return Ok(keyword_results); 
+ return Ok((keyword_results, Some(0))); }; let vector_query = match vector { @@ -166,7 +196,7 @@ impl<'a> Search<'a> { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); - return Ok(keyword_results); + return Ok((keyword_results, Some(0))); } } } @@ -181,10 +211,10 @@ impl<'a> Search<'a> { let keyword_results = ScoreWithRatioResult::new(keyword_results, 1.0 - semantic_ratio); let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio); - let merge_results = + let (merge_results, semantic_hit_count) = ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit); assert!(merge_results.documents_ids.len() <= self.limit); - Ok(merge_results) + Ok((merge_results, Some(semantic_hit_count))) } fn results_good_enough(&self, keyword_results: &SearchResult, semantic_ratio: f32) -> bool {