From 8c4921b9ddd5fe3e2a9547633357b3c34f8a9761 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 27 Jun 2024 14:17:33 +0200 Subject: [PATCH 1/2] Add failing test on limit+offset for hybrid search --- meilisearch/tests/search/hybrid.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 31b2940d8..d8069ea5c 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -150,6 +150,35 @@ async fn simple_search() { snapshot!(response["semanticHitCount"], @"3"); } +#[actix_rt::test] +async fn limit_offset() { + let server = Server::new().await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true, "offset": 1, "limit": 1}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###); + snapshot!(response["semanticHitCount"], @"0"); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + + let server = Server::new().await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.9}, "retrieveVectors": true, "offset": 1, "limit": 1}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}}]"###); + snapshot!(response["semanticHitCount"], @"1"); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); +} + #[actix_rt::test] async fn simple_search_hf() { let server = Server::new().await; From e53de15b8e7014232924dc107697ebe8e36adc2b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 27 Jun 2024 14:21:48 +0200 Subject: [PATCH 2/2] Fix behavior of limit and offset for hybrid search when keyword results are returned early The test is fixed --- meilisearch/tests/search/hybrid.rs | 2 +- milli/src/search/hybrid.rs | 47 ++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index d8069ea5c..02768bf60 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -161,7 +161,7 @@ async fn limit_offset() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###); snapshot!(response["semanticHitCount"], @"0"); assert_eq!(response["hits"].as_array().unwrap().len(), 1); diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index 1c784097d..f7e1aa492 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -178,16 +178,16 @@ impl<'a> Search<'a> { // completely skip semantic search if the results of the keyword search are good enough if self.results_good_enough(&keyword_results, semantic_ratio) { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); } // no vector search against placeholder search let Some(query) = search.query.take() else { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; // no embedder, no semantic search let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; let vector_query = match vector { @@ -239,3 +239,44 @@ impl<'a> Search<'a> { true } } + +fn return_keyword_results( + limit: usize, + offset: usize, + SearchResult { + matching_words, + candidates, + mut documents_ids, + mut document_scores, + degraded, + used_negative_operator, + }: SearchResult, +) -> (SearchResult, Option) { + let (documents_ids, document_scores) = if offset >= documents_ids.len() || + // technically redudant because documents_ids.len() == document_scores.len(), + // defensive programming + offset >= document_scores.len() + { + (vec![], vec![]) + } else { + // PANICS: offset < len + documents_ids.rotate_left(offset); + documents_ids.truncate(limit); + + // PANICS: offset < len + document_scores.rotate_left(offset); + document_scores.truncate(limit); + (documents_ids, document_scores) + }; + ( + SearchResult { + matching_words, + candidates, + documents_ids, + document_scores, + degraded, + used_negative_operator, + }, + Some(0), + ) +}