From 460e61b85303949b682900090f2c477e1d62db4d Mon Sep 17 00:00:00 2001 From: Vivek Kumar Date: Thu, 19 Oct 2023 16:48:45 +0530 Subject: [PATCH 1/3] compute all candidates correctly when skipping --- milli/src/search/new/bucket_sort.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index cf2f08cce..df9c14c7d 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -46,9 +46,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( if let Some(distinct_fid) = distinct_fid { let mut excluded = RoaringBitmap::new(); let mut results = vec![]; - let mut skip = 0; for docid in universe.iter() { - if results.len() >= length { + if results.len() >= from + length { break; } if excluded.contains(docid) { @@ -56,16 +55,16 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?; - skip += 1; - if skip <= from { - continue; - } - results.push(docid); } let mut all_candidates = universe - excluded; all_candidates.extend(results.iter().copied()); + if results.len() >= from { + results.drain(..from); + } else { + results.clear(); + } return Ok(BucketSortOutput { scores: vec![Default::default(); results.len()], From aa2cd52797c14926150a95e5e4d2645a71e75cde Mon Sep 17 00:00:00 2001 From: Vivek Kumar Date: Thu, 19 Oct 2023 16:50:14 +0530 Subject: [PATCH 2/3] add/update tests when search with distinct attribute & pagination with no ranking --- meilisearch/tests/search/distinct.rs | 228 ++++++++++++++++++++++++--- milli/tests/search/distinct.rs | 8 +- 2 files changed, 207 insertions(+), 29 deletions(-) diff --git a/meilisearch/tests/search/distinct.rs b/meilisearch/tests/search/distinct.rs index 068bd12b1..35c09035a 100644 --- a/meilisearch/tests/search/distinct.rs +++ b/meilisearch/tests/search/distinct.rs @@ -6,21 +6,109 @@ use crate::common::Server; pub(self) static DOCUMENTS: Lazy = Lazy::new(|| { json!([ - {"productId": 1, "shopId": 1}, - {"productId": 2, "shopId": 1}, - {"productId": 3, "shopId": 2}, - {"productId": 4, "shopId": 2}, - {"productId": 5, "shopId": 3}, - {"productId": 6, "shopId": 3}, - {"productId": 7, "shopId": 4}, - {"productId": 8, "shopId": 4}, - {"productId": 9, "shopId": 5}, - {"productId": 10, "shopId": 5} + { + "id": 1, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": "Brown" + }, + { + "id": 2, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": "Black" + }, + { + "id": 3, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": "Blue" + }, + { + "id": 4, + "description": "T-Shirt", + "brand": "Nike", + "product_id": "789012", + "color": "Red" + }, + { + "id": 5, + "description": "T-Shirt", + "brand": "Nike", + "product_id": "789012", + "color": "Blue" + }, + { + "id": 6, + "description": "Running Shoes", + "brand": "Adidas", + "product_id": "456789", + "color": "Black" + }, + { + "id": 7, + "description": "Running Shoes", + "brand": "Adidas", + "product_id": "456789", + "color": "White" + }, + { + "id": 8, + "description": "Hoodie", + "brand": "Puma", + "product_id": "987654", + "color": "Gray" + }, + { + "id": 9, + "description": "Sweater", + "brand": "Gap", + "product_id": "234567", + "color": "Green" + }, + { + "id": 10, + "description": "Sweater", + "brand": "Gap", + "product_id": "234567", + "color": "Red" + }, + { + "id": 11, + "description": "Sweater", + "brand": "Gap", + "product_id": "234567", + "color": "Blue" + }, + { + "id": 12, + "description": "Jeans", + "brand": "Levi's", + "product_id": "345678", + "color": "Indigo" + }, + { + "id": 13, + "description": "Jeans", + "brand": "Levi's", + "product_id": "345678", + "color": "Black" + }, + { + "id": 14, + "description": "Jeans", + "brand": "Levi's", + "product_id": "345678", + "color": "Stone Wash" + } ]) }); -pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId"; -pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId"; +pub(self) static DOCUMENT_PRIMARY_KEY: &str = "id"; +pub(self) static DOCUMENT_DISTINCT_KEY: &str = "product_id"; /// testing: https://github.com/meilisearch/meilisearch/issues/4078 #[actix_rt::test] @@ -33,31 +121,121 @@ async fn distinct_search_with_offset_no_ranking() { index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; index.wait_task(1).await; - fn get_hits(response: Value) -> Vec { + fn get_hits(response: &Value) -> Vec<&str> { let hits_array = response["hits"].as_array().unwrap(); - hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::>() + hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::>() } - let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await; - let hits = get_hits(response); + let (response, code) = index.search_post(json!({"offset": 0, "limit": 2})).await; + let hits = get_hits(&response); snapshot!(code, @"200 OK"); snapshot!(hits.len(), @"2"); - snapshot!(format!("{:?}", hits), @"[1, 2]"); + snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#); + snapshot!(response["estimatedTotalHits"] , @"11"); - let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await; - let hits = get_hits(response); + let (response, code) = index.search_post(json!({"offset": 2, "limit": 2})).await; + let hits = get_hits(&response); snapshot!(code, @"200 OK"); snapshot!(hits.len(), @"2"); - snapshot!(format!("{:?}", hits), @"[3, 4]"); + snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#); + snapshot!(response["estimatedTotalHits"], @"10"); - let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await; - let hits = get_hits(response); + let (response, code) = index.search_post(json!({"offset": 4, "limit": 2})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"2"); + snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#); + snapshot!(response["estimatedTotalHits"], @"6"); + + let (response, code) = index.search_post(json!({"offset": 5, "limit": 2})).await; + let hits = get_hits(&response); snapshot!(code, @"200 OK"); snapshot!(hits.len(), @"1"); - snapshot!(format!("{:?}", hits), @"[5]"); + snapshot!(format!("{:?}", hits), @r#"["345678"]"#); + snapshot!(response["estimatedTotalHits"], @"6"); - let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await; - let hits = get_hits(response); + let (response, code) = index.search_post(json!({"offset": 6, "limit": 2})).await; + let hits = get_hits(&response); snapshot!(code, @"200 OK"); snapshot!(hits.len(), @"0"); + snapshot!(format!("{:?}", hits), @r#"[]"#); + snapshot!(response["estimatedTotalHits"], @"6"); + + let (response, code) = index.search_post(json!({"offset": 7, "limit": 2})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"0"); + snapshot!(format!("{:?}", hits), @r#"[]"#); + snapshot!(response["estimatedTotalHits"], @"6"); +} + +/// testing: https://github.com/meilisearch/meilisearch/issues/4130 +#[actix_rt::test] +async fn distinct_search_with_pagination_no_ranking() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; + index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; + index.wait_task(1).await; + + fn get_hits(response: &Value) -> Vec<&str> { + let hits_array = response["hits"].as_array().unwrap(); + hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::>() + } + + let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"0"); + snapshot!(format!("{:?}", hits), @r#"[]"#); + snapshot!(response["page"], @"0"); + snapshot!(response["totalPages"], @"3"); + snapshot!(response["totalHits"], @"6"); + + let (response, code) = index.search_post(json!({"page": 1, "hitsPerPage": 2})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"2"); + snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#); + snapshot!(response["page"], @"1"); + snapshot!(response["totalPages"], @"3"); + snapshot!(response["totalHits"], @"6"); + + let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 2})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"2"); + snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#); + snapshot!(response["page"], @"2"); + snapshot!(response["totalPages"], @"3"); + snapshot!(response["totalHits"], @"6"); + + let (response, code) = index.search_post(json!({"page": 3, "hitsPerPage": 2})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"2"); + snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#); + snapshot!(response["page"], @"3"); + snapshot!(response["totalPages"], @"3"); + snapshot!(response["totalHits"], @"6"); + + let (response, code) = index.search_post(json!({"page": 4, "hitsPerPage": 2})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"0"); + snapshot!(format!("{:?}", hits), @r#"[]"#); + snapshot!(response["page"], @"4"); + snapshot!(response["totalPages"], @"3"); + snapshot!(response["totalHits"], @"6"); + + let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 3})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"3"); + snapshot!(format!("{:?}", hits), @r#"["987654", "234567", "345678"]"#); + snapshot!(response["page"], @"2"); + snapshot!(response["totalPages"], @"2"); + snapshot!(response["totalHits"], @"6"); } diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index e1876286c..fc890dfe8 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -202,7 +202,7 @@ test_distinct!( EXTERNAL_DOCUMENTS_IDS.len(), 1, vec![], - 2 + 3 ); test_distinct!( // testing: https://github.com/meilisearch/meilisearch/issues/4078 @@ -212,7 +212,7 @@ test_distinct!( 1, 2, vec![], - 1 + 3 ); test_distinct!( // testing: https://github.com/meilisearch/meilisearch/issues/4078 @@ -222,7 +222,7 @@ test_distinct!( EXTERNAL_DOCUMENTS_IDS.len(), 2, vec![], - 5 + 7 ); test_distinct!( // testing: https://github.com/meilisearch/meilisearch/issues/4078 @@ -232,5 +232,5 @@ test_distinct!( 2, 4, vec![], - 3 + 7 ); From 18bbadf6455f21407d78a8e485857be1d23b888e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 19 Oct 2023 15:58:25 +0200 Subject: [PATCH 3/3] Add explanatory comment --- milli/src/search/new/bucket_sort.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index df9c14c7d..b46f6124f 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -60,6 +60,9 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( let mut all_candidates = universe - excluded; all_candidates.extend(results.iter().copied()); + // drain the results of the skipped elements + // this **must** be done **after** writing the entire results in `all_candidates` to ensure + // e.g. estimatedTotalHits is correct. if results.len() >= from { results.drain(..from); } else {