From b8cda6c300f8ca351a739319b2fcfadcc80e327b Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 14 Mar 2024 17:34:46 +0100 Subject: [PATCH] fix the search cutoff and add a test --- meilisearch/tests/search/mod.rs | 109 +++++++ milli/src/lib.rs | 37 ++- milli/src/score_details.rs | 12 + milli/src/search/hybrid.rs | 2 +- milli/src/search/mod.rs | 4 +- milli/src/search/new/bucket_sort.rs | 16 +- milli/src/search/new/tests/cutoff.rs | 419 +++++++++++++++++++++++++++ milli/src/search/new/tests/mod.rs | 1 + milli/tests/search/mod.rs | 45 +-- 9 files changed, 590 insertions(+), 55 deletions(-) create mode 100644 milli/src/search/new/tests/cutoff.rs diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 90098c5b6..62dd73c63 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -834,6 +834,115 @@ async fn test_score_details() { .await; } +#[actix_rt::test] +async fn test_degraded_score_details() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = NESTED_DOCUMENTS.clone(); + + index.add_documents(json!(documents), None).await; + // We can't really use anything else than 0ms here; otherwise, the test will get flaky. + let (res, _code) = index.update_settings(json!({ "searchCutoff": 0 })).await; + index.wait_task(res.uid()).await; + + index + .search( + json!({ + "q": "b", + "showRankingScoreDetails": true, + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pésti", + "_vectors": { + "manual": [ + 1, + 2, + 3 + ] + }, + "_rankingScoreDetails": { + "skipped": 0.0 + } + }, + { + "id": 654, + "father": "pierre", + "mother": "sabine", + "doggos": [ + { + "name": "gros bill", + "age": 8 + } + ], + "cattos": [ + "simba", + "pestiféré" + ], + "_vectors": { + "manual": [ + 1, + 2, + 54 + ] + }, + "_rankingScoreDetails": { + "skipped": 0.0 + } + }, + { + "id": 951, + "father": "jean-baptiste", + "mother": "sophie", + "doggos": [ + { + "name": "turbo", + "age": 5 + }, + { + "name": "fast", + "age": 6 + } + ], + "cattos": [ + "moumoute", + "gomez" + ], + "_vectors": { + "manual": [ + 10, + 23, + 32 + ] + }, + "_rankingScoreDetails": { + "skipped": 0.0 + } + } + ] + "###); + }, + ) + .await; +} + #[actix_rt::test] async fn experimental_feature_vector_store() { let server = Server::new().await; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 896aadb50..df44ca127 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -105,10 +105,15 @@ pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; -#[derive(Clone, Copy)] +#[derive(Clone)] pub struct TimeBudget { started_at: std::time::Instant, budget: std::time::Duration, + + /// When testing the time budget, ensuring we did more than iteration of the bucket sort can be useful. + /// But to avoid being flaky, the only option is to add the ability to stop after a specific number of calls instead of a `Duration`. 
+ #[cfg(test)] + stop_after: Option<(std::sync::Arc, usize)>, } impl fmt::Debug for TimeBudget { @@ -129,18 +134,40 @@ impl Default for TimeBudget { impl TimeBudget { pub fn new(budget: std::time::Duration) -> Self { - Self { started_at: std::time::Instant::now(), budget } + Self { + started_at: std::time::Instant::now(), + budget, + + #[cfg(test)] + stop_after: None, + } } pub fn max() -> Self { Self::new(std::time::Duration::from_secs(u64::MAX)) } - pub fn exceeded(&self) -> bool { - self.must_stop() + #[cfg(test)] + pub fn with_stop_after(mut self, stop_after: usize) -> Self { + use std::sync::atomic::AtomicUsize; + use std::sync::Arc; + + self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after)); + self } - pub fn must_stop(&self) -> bool { + pub fn exceeded(&self) -> bool { + #[cfg(test)] + if let Some((current, stop_after)) = &self.stop_after { + let current = current.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if current >= *stop_after { + return true; + } else { + // if a number has been specified then we ignore entirely the time budget + return false; + } + } + self.started_at.elapsed() > self.budget } } diff --git a/milli/src/score_details.rs b/milli/src/score_details.rs index f6b9db58c..f2c6fb58a 100644 --- a/milli/src/score_details.rs +++ b/milli/src/score_details.rs @@ -17,6 +17,9 @@ pub enum ScoreDetails { Sort(Sort), Vector(Vector), GeoSort(GeoSort), + + /// Returned when we don't have the time to finish applying all the subsequent ranking-rules + Skipped, } #[derive(Clone, Copy)] @@ -50,6 +53,7 @@ impl ScoreDetails { ScoreDetails::Sort(_) => None, ScoreDetails::GeoSort(_) => None, ScoreDetails::Vector(_) => None, + ScoreDetails::Skipped => Some(Rank { rank: 0, max_rank: 1 }), } } @@ -97,6 +101,7 @@ impl ScoreDetails { ScoreDetails::Vector(vector) => RankOrValue::Score( vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64), ), + ScoreDetails::Skipped => RankOrValue::Score(0.), } } @@ -256,6 +261,13 @@ impl ScoreDetails { details_map.insert(vector, details); order += 1; } + ScoreDetails::Skipped => { + details_map.insert( + "skipped".to_string(), + serde_json::Number::from_f64(0.).unwrap().into(), + ); + order += 1; + } } } details_map diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index 9d8b3860d..1e14f4430 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -132,7 +132,7 @@ impl<'a> Search<'a> { index: self.index, distribution_shift: self.distribution_shift, embedder_name: self.embedder_name.clone(), - time_budget: self.time_budget, + time_budget: self.time_budget.clone(), }; let vector_query = search.vector.take(); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index b14d88d03..f6ab8a7de 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -195,7 +195,7 @@ impl<'a> Search<'a> { self.limit, self.distribution_shift, embedder_name, - self.time_budget, + self.time_budget.clone(), )?, None => execute_search( &mut ctx, @@ -211,7 +211,7 @@ impl<'a> Search<'a> { Some(self.words_limit), &mut DefaultSearchLogger, &mut DefaultSearchLogger, - self.time_budget, + self.time_budget.clone(), )?, }; diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 7fc830c1f..521fcb983 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -161,11 +161,21 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( while valid_docids.len() < length { if time_budget.exceeded() { - let bucket = std::mem::take(&mut 
ranking_rule_universes[cur_ranking_rule_index]); - maybe_add_to_results!(bucket); + loop { + let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); + ranking_rule_scores.push(ScoreDetails::Skipped); + maybe_add_to_results!(bucket); + ranking_rule_scores.pop(); + + if cur_ranking_rule_index == 0 { + break; + } + + back!(); + } return Ok(BucketSortOutput { - scores: vec![Default::default(); valid_docids.len()], + scores: valid_scores, docids: valid_docids, all_candidates, degraded: true, diff --git a/milli/src/search/new/tests/cutoff.rs b/milli/src/search/new/tests/cutoff.rs new file mode 100644 index 000000000..4256abc2b --- /dev/null +++ b/milli/src/search/new/tests/cutoff.rs @@ -0,0 +1,419 @@ +//! This module test the search cutoff and ensure a few things: +//! 1. A basic test works and mark the search as degraded +//! 2. A test that ensure the filters are affectively applied even with a cutoff of 0 +//! 3. A test that ensure the cutoff works well with the ranking scores + +use std::time::Duration; + +use big_s::S; +use maplit::hashset; +use meili_snap::snapshot; + +use crate::index::tests::TempIndex; +use crate::{Criterion, Filter, Search, TimeBudget}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_filterable_fields(hashset! { S("id") }); + s.set_criteria(vec![Criterion::Words, Criterion::Typo]); + }) + .unwrap(); + + // reverse the ID / insertion order so we see better what was sorted from what got the insertion order ordering + index + .add_documents(documents!([ + { + "id": 4, + "text": "hella puppo kefir", + }, + { + "id": 3, + "text": "hella puppy kefir", + }, + { + "id": 2, + "text": "hello", + }, + { + "id": 1, + "text": "hello puppy", + }, + { + "id": 0, + "text": "hello puppy kefir", + }, + ])) + .unwrap(); + index +} + +#[test] +fn basic_degraded_search() { + let index = create_index(); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query("hello puppy kefir"); + search.limit(3); + search.time_budget(TimeBudget::new(Duration::from_millis(0))); + + let result = search.execute().unwrap(); + assert!(result.degraded); +} + +#[test] +fn degraded_search_cannot_skip_filter() { + let index = create_index(); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query("hello puppy kefir"); + search.limit(100); + search.time_budget(TimeBudget::new(Duration::from_millis(0))); + let filter_condition = Filter::from_str("id > 2").unwrap().unwrap(); + search.filter(filter_condition); + + let result = search.execute().unwrap(); + assert!(result.degraded); + snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###" + RoaringBitmap<[0, 1]> + [0, 1] + "###); +} + +#[test] +fn degraded_search_and_score_details() { + let index = create_index(); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query("hello puppy kefir"); + search.limit(4); + search.time_budget(TimeBudget::max()); + + let result = search.execute().unwrap(); + snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###" + [ + 4, + 1, + 0, + 3, + ] + [ + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 0, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + 
}, + ), + Typo( + Typo { + typo_count: 1, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 2, + max_matching_words: 3, + }, + ), + ], + ] + "###); + + // Do ONE loop iteration. Not much can be deduced, almost everyone matched the words first bucket. + search.time_budget(TimeBudget::max().with_stop_after(1)); + + let result = search.execute().unwrap(); + snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###" + [ + 0, + 1, + 4, + 2, + ] + [ + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Skipped, + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Skipped, + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Skipped, + ], + [ + Skipped, + ], + ] + "###); + + // Do TWO loop iterations. The first document should be entirely sorted + search.time_budget(TimeBudget::max().with_stop_after(2)); + + let result = search.execute().unwrap(); + snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###" + [ + 4, + 0, + 1, + 2, + ] + [ + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 0, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Skipped, + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Skipped, + ], + [ + Skipped, + ], + ] + "###); + + // Do THREE loop iterations. The second document should be entirely sorted as well + search.time_budget(TimeBudget::max().with_stop_after(3)); + + let result = search.execute().unwrap(); + snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###" + [ + 4, + 1, + 0, + 2, + ] + [ + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 0, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 1, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Skipped, + ], + [ + Skipped, + ], + ] + "###); + + // Do FOUR loop iterations. The third document should be entirely sorted as well + // The words bucket have still not progressed thus the last document doesn't have any info yet. + search.time_budget(TimeBudget::max().with_stop_after(4)); + + let result = search.execute().unwrap(); + snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###" + [ + 4, + 1, + 0, + 2, + ] + [ + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 0, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 1, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + ], + [ + Skipped, + ], + ] + "###); + + // After FIVE loop iteration. The words ranking rule gave us a new bucket. + // Since we reached the limit we were able to early exit without checking the typo ranking rule. 
+ search.time_budget(TimeBudget::max().with_stop_after(5)); + + let result = search.execute().unwrap(); + snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###" + [ + 4, + 1, + 0, + 3, + ] + [ + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 0, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + Typo( + Typo { + typo_count: 1, + max_typo_count: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 3, + max_matching_words: 3, + }, + ), + ], + [ + Words( + Words { + matching_words: 2, + max_matching_words: 3, + }, + ), + ], + ] + "###); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index e500d16fb..26199b79b 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -1,5 +1,6 @@ pub mod attribute_fid; pub mod attribute_position; +pub mod cutoff; pub mod distinct; pub mod exactness; pub mod geo_sort; diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index ab6befa60..9193ab762 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,19 +1,14 @@ use std::cmp::Reverse; use std::collections::HashSet; use std::io::Cursor; -use std::time::Duration; use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{btreemap, hashset}; -use meili_snap::snapshot; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{ - AscDesc, Criterion, DocumentId, Filter, Index, Member, Object, Search, TermsMatchingStrategy, - TimeBudget, -}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy}; use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; @@ -354,41 +349,3 @@ where let result = serde_json::Value::deserialize(deserializer)?; Ok(Some(result)) } - -#[test] -fn basic_degraded_search() { - use Criterion::*; - let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; - let index = setup_search_index_with_criteria(&criteria); - let rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&rtxn, &index); - search.query(TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.time_budget(TimeBudget::new(Duration::from_millis(0))); - - let result = search.execute().unwrap(); - assert!(result.degraded); -} - -#[test] -fn degraded_search_cannot_skip_filter() { - use Criterion::*; - let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; - let index = setup_search_index_with_criteria(&criteria); - let rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&rtxn, &index); - search.query(TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.time_budget(TimeBudget::new(Duration::from_millis(0))); - let filter_condition = Filter::from_str("tag = etiopia").unwrap().unwrap(); - search.filter(filter_condition); - - let result = search.execute().unwrap(); - assert!(result.degraded); - snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###" - RoaringBitmap<[0, 2, 5, 8, 11, 14]> - [0, 2, 5, 8, 11, 14] - "###); -}
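
Note (illustration only, not part of the patch): the core idea above is that `TimeBudget` becomes `Clone` and, in tests, can be told to trip after an exact number of `exceeded()` calls, so the bucket sort stops after a deterministic number of iterations instead of relying on a flaky wall-clock duration; when it trips, the remaining candidates are flushed with a `Skipped` score detail rather than dropped. The standalone Rust sketch below mirrors that mechanism with simplified, hypothetical stand-ins (`TimeBudget`, `Score`, and `rank` here are not the actual milli API).

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};

/// Stand-in for the patched milli `TimeBudget`: it can expire on wall-clock time,
/// or (for tests) after a fixed number of `exceeded()` calls, ignoring the clock.
#[derive(Clone)]
struct TimeBudget {
    started_at: Instant,
    budget: Duration,
    stop_after: Option<(Arc<AtomicUsize>, usize)>,
}

impl TimeBudget {
    fn new(budget: Duration) -> Self {
        Self { started_at: Instant::now(), budget, stop_after: None }
    }

    fn max() -> Self {
        Self::new(Duration::from_secs(u64::MAX))
    }

    fn with_stop_after(mut self, stop_after: usize) -> Self {
        self.stop_after = Some((Arc::new(AtomicUsize::new(0)), stop_after));
        self
    }

    fn exceeded(&self) -> bool {
        if let Some((calls, stop_after)) = &self.stop_after {
            // Deterministic test mode: count the calls and ignore the clock entirely.
            return calls.fetch_add(1, Ordering::Relaxed) >= *stop_after;
        }
        self.started_at.elapsed() > self.budget
    }
}

/// Toy score detail: either the document went through the ranking pass,
/// or the budget ran out and the pass was skipped for it.
#[derive(Debug)]
enum Score {
    Ranked(usize),
    Skipped,
}

/// Toy stand-in for the degraded bucket sort: once the budget trips, the remaining
/// candidates are kept and tagged `Skipped` instead of being silently dropped.
fn rank(candidates: Vec<u32>, budget: &TimeBudget) -> Vec<(u32, Score)> {
    let mut out = Vec::new();
    for (position, docid) in candidates.into_iter().enumerate() {
        if budget.exceeded() {
            out.push((docid, Score::Skipped));
            continue;
        }
        out.push((docid, Score::Ranked(position)));
    }
    out
}

fn main() {
    // Trip after two `exceeded()` checks: the first two documents get ranked,
    // the remaining ones come back with `Score::Skipped`.
    let budget = TimeBudget::max().with_stop_after(2);
    for (docid, score) in rank(vec![4, 1, 0, 3], &budget) {
        println!("{docid}: {score:?}");
    }
}

Counting calls through an `Arc<AtomicUsize>` keeps the counter shared across clones, which is also why the patch switches `TimeBudget` from `Copy` to `Clone` and clones it explicitly in the hybrid and plain search paths.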