4140: Fix bug where search with distinct attribute has wrong totalHits and totalPages r=dureuill a=vivek-26

# Pull Request

## Related issue
Fixes #4130

## What does this PR do?
This PR - 
- Fixes the bug where search with distinct attribute and pagination (no ranking) returns incorrect values for `totalHits` and `totalPages`.
- Add/update unit and integration tests.

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
Co-authored-by: Louis Dureuil <louis.dureuil@gmail.com>
This commit is contained in:
meili-bors[bot] 2023-10-19 14:00:53 +00:00 committed by GitHub
commit 1032d82643
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 216 additions and 36 deletions

View File

@ -6,21 +6,109 @@ use crate::common::Server;
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| { pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([ json!([
{"productId": 1, "shopId": 1}, {
{"productId": 2, "shopId": 1}, "id": 1,
{"productId": 3, "shopId": 2}, "description": "Leather Jacket",
{"productId": 4, "shopId": 2}, "brand": "Lee Jeans",
{"productId": 5, "shopId": 3}, "product_id": "123456",
{"productId": 6, "shopId": 3}, "color": "Brown"
{"productId": 7, "shopId": 4}, },
{"productId": 8, "shopId": 4}, {
{"productId": 9, "shopId": 5}, "id": 2,
{"productId": 10, "shopId": 5} "description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Black"
},
{
"id": 3,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Blue"
},
{
"id": 4,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": "Red"
},
{
"id": 5,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": "Blue"
},
{
"id": 6,
"description": "Running Shoes",
"brand": "Adidas",
"product_id": "456789",
"color": "Black"
},
{
"id": 7,
"description": "Running Shoes",
"brand": "Adidas",
"product_id": "456789",
"color": "White"
},
{
"id": 8,
"description": "Hoodie",
"brand": "Puma",
"product_id": "987654",
"color": "Gray"
},
{
"id": 9,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Green"
},
{
"id": 10,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Red"
},
{
"id": 11,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Blue"
},
{
"id": 12,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Indigo"
},
{
"id": 13,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Black"
},
{
"id": 14,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Stone Wash"
}
]) ])
}); });
pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId"; pub(self) static DOCUMENT_PRIMARY_KEY: &str = "id";
pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId"; pub(self) static DOCUMENT_DISTINCT_KEY: &str = "product_id";
/// testing: https://github.com/meilisearch/meilisearch/issues/4078 /// testing: https://github.com/meilisearch/meilisearch/issues/4078
#[actix_rt::test] #[actix_rt::test]
@ -33,31 +121,121 @@ async fn distinct_search_with_offset_no_ranking() {
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
index.wait_task(1).await; index.wait_task(1).await;
fn get_hits(response: Value) -> Vec<i64> { fn get_hits(response: &Value) -> Vec<&str> {
let hits_array = response["hits"].as_array().unwrap(); let hits_array = response["hits"].as_array().unwrap();
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>() hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
} }
let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await; let (response, code) = index.search_post(json!({"offset": 0, "limit": 2})).await;
let hits = get_hits(response); let hits = get_hits(&response);
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2"); snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @"[1, 2]"); snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
snapshot!(response["estimatedTotalHits"] , @"11");
let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await; let (response, code) = index.search_post(json!({"offset": 2, "limit": 2})).await;
let hits = get_hits(response); let hits = get_hits(&response);
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2"); snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @"[3, 4]"); snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
snapshot!(response["estimatedTotalHits"], @"10");
let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await; let (response, code) = index.search_post(json!({"offset": 4, "limit": 2})).await;
let hits = get_hits(response); let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 5, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"1"); snapshot!(hits.len(), @"1");
snapshot!(format!("{:?}", hits), @"[5]"); snapshot!(format!("{:?}", hits), @r#"["345678"]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await; let (response, code) = index.search_post(json!({"offset": 6, "limit": 2})).await;
let hits = get_hits(response); let hits = get_hits(&response);
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0"); snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 7, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["estimatedTotalHits"], @"6");
}
/// testing: https://github.com/meilisearch/meilisearch/issues/4130
#[actix_rt::test]
async fn distinct_search_with_pagination_no_ranking() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
index.wait_task(1).await;
fn get_hits(response: &Value) -> Vec<&str> {
let hits_array = response["hits"].as_array().unwrap();
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
}
let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["page"], @"0");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 1, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
snapshot!(response["page"], @"1");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
snapshot!(response["page"], @"2");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 3, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
snapshot!(response["page"], @"3");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 4, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["page"], @"4");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 3})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"3");
snapshot!(format!("{:?}", hits), @r#"["987654", "234567", "345678"]"#);
snapshot!(response["page"], @"2");
snapshot!(response["totalPages"], @"2");
snapshot!(response["totalHits"], @"6");
} }

View File

@ -46,9 +46,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
if let Some(distinct_fid) = distinct_fid { if let Some(distinct_fid) = distinct_fid {
let mut excluded = RoaringBitmap::new(); let mut excluded = RoaringBitmap::new();
let mut results = vec![]; let mut results = vec![];
let mut skip = 0;
for docid in universe.iter() { for docid in universe.iter() {
if results.len() >= length { if results.len() >= from + length {
break; break;
} }
if excluded.contains(docid) { if excluded.contains(docid) {
@ -56,16 +55,19 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
} }
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?; distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
skip += 1;
if skip <= from {
continue;
}
results.push(docid); results.push(docid);
} }
let mut all_candidates = universe - excluded; let mut all_candidates = universe - excluded;
all_candidates.extend(results.iter().copied()); all_candidates.extend(results.iter().copied());
// drain the results of the skipped elements
// this **must** be done **after** writing the entire results in `all_candidates` to ensure
// e.g. estimatedTotalHits is correct.
if results.len() >= from {
results.drain(..from);
} else {
results.clear();
}
return Ok(BucketSortOutput { return Ok(BucketSortOutput {
scores: vec![Default::default(); results.len()], scores: vec![Default::default(); results.len()],

View File

@ -202,7 +202,7 @@ test_distinct!(
EXTERNAL_DOCUMENTS_IDS.len(), EXTERNAL_DOCUMENTS_IDS.len(),
1, 1,
vec![], vec![],
2 3
); );
test_distinct!( test_distinct!(
// testing: https://github.com/meilisearch/meilisearch/issues/4078 // testing: https://github.com/meilisearch/meilisearch/issues/4078
@ -212,7 +212,7 @@ test_distinct!(
1, 1,
2, 2,
vec![], vec![],
1 3
); );
test_distinct!( test_distinct!(
// testing: https://github.com/meilisearch/meilisearch/issues/4078 // testing: https://github.com/meilisearch/meilisearch/issues/4078
@ -222,7 +222,7 @@ test_distinct!(
EXTERNAL_DOCUMENTS_IDS.len(), EXTERNAL_DOCUMENTS_IDS.len(),
2, 2,
vec![], vec![],
5 7
); );
test_distinct!( test_distinct!(
// testing: https://github.com/meilisearch/meilisearch/issues/4078 // testing: https://github.com/meilisearch/meilisearch/issues/4078
@ -232,5 +232,5 @@ test_distinct!(
2, 2,
4, 4,
vec![], vec![],
3 7
); );