From 55bad07c165476594dd82c37fc1df9bf6eba54cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Apr 2023 10:40:05 +0200 Subject: [PATCH] Fix bug in exact_attribute rr implementation --- milli/src/search/new/exact_attribute.rs | 43 +++++++++++++------------ milli/tests/assets/test_set.ndjson | 4 +-- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 0b95243bc..93dd7c3fc 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -91,13 +91,14 @@ impl State { universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result { - // An ordered list of the (remaining) query terms, with data extracted from them: - // 0. exact subterm. If it doesn't exist, the term is skipped. - // 1. start position of the term - // 2. id of the term - let mut count_all_positions = 0; + struct ExactTermInfo { + exact_term: ExactTerm, + start_position: u16, + start_term_id: u8, + position_count: usize, + } - let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = + let mut exact_terms: Vec = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { match &node.data { @@ -107,34 +108,35 @@ impl State { } else { continue; }; - count_all_positions += term.positions.len(); - exact_term_position_ids.push(( + exact_terms.push(ExactTermInfo { exact_term, - *term.positions.start(), - *term.term_ids.start(), - )) + start_position: *term.positions.start(), + start_term_id: *term.term_ids.start(), + position_count: term.positions.len(), + }); } QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, } } - exact_term_position_ids.sort_by_key(|(_, _, id)| *id); - exact_term_position_ids.dedup_by_key(|(_, _, id)| *id); + exact_terms.sort_by_key(|x| x.start_term_id); + exact_terms.dedup_by_key(|x| x.start_term_id); + let count_all_positions = exact_terms.iter().fold(0, |acc, x| acc + x.position_count); // bail if there is a "hole" (missing word) in remaining query graph - if let Some((_, _, first_id)) = exact_term_position_ids.first() { - if *first_id != 0 { + if let Some(e) = exact_terms.first() { + if e.start_term_id != 0 { return Ok(State::Empty(query_graph.clone())); } } else { return Ok(State::Empty(query_graph.clone())); } let mut previous_id = 0; - for (_, _, id) in exact_term_position_ids.iter().copied() { - if id < previous_id || id - previous_id > 1 { + for e in exact_terms.iter() { + if e.start_term_id < previous_id || e.start_term_id - previous_id > 1 { return Ok(State::Empty(query_graph.clone())); } else { - previous_id = id; + previous_id = e.start_term_id; } } @@ -147,10 +149,9 @@ impl State { // first check that for each term, there exists some attribute that has this term at the correct position //"word-position-docids"; let mut candidates = universe.clone(); - let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids + let words_positions: Vec<(Vec<_>, _)> = exact_terms .iter() - .copied() - .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) + .map(|e| (e.exact_term.interned_words(ctx).collect(), e.start_position)) .collect(); for (words, position) in &words_positions { if candidates.is_empty() { diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 60ee48dd2..4c83cbe14 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -134,7 +134,7 @@ "typo_rank": 0, "proximity_rank": 0, "attribute_rank": 1, - "exact_rank": 3, + "exact_rank": 1, "asc_desc_rank": 5, "sort_by_rank": 2, "geo_rank": 34692, @@ -369,7 +369,7 @@ "typo_rank": 0, "proximity_rank": 0, "attribute_rank": 1, - "exact_rank": 2, + "exact_rank": 0, "asc_desc_rank": 2, "sort_by_rank": 1, "geo_rank": 9339230,