From aac1d769a79b5a49b9d5f421bb3b551d9c1ad7a3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 11 Apr 2024 19:04:06 +0200 Subject: [PATCH 001/110] Add ranking_score_threshold to milli --- milli/src/search/hybrid.rs | 1 + milli/src/search/mod.rs | 14 ++++++++++++++ milli/src/search/new/bucket_sort.rs | 25 +++++++++++++++++++++++++ milli/src/search/new/matches/mod.rs | 1 + milli/src/search/new/mod.rs | 5 +++++ 5 files changed, 46 insertions(+) diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index fc13a5e1e..87f922c4c 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -169,6 +169,7 @@ impl<'a> Search<'a> { index: self.index, semantic: self.semantic.clone(), time_budget: self.time_budget.clone(), + ranking_score_threshold: self.ranking_score_threshold, }; let semantic = search.semantic.take(); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 76068b1f2..f7bcf6e7b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -50,6 +50,7 @@ pub struct Search<'a> { index: &'a Index, semantic: Option, time_budget: TimeBudget, + ranking_score_threshold: Option, } impl<'a> Search<'a> { @@ -70,6 +71,7 @@ impl<'a> Search<'a> { index, semantic: None, time_budget: TimeBudget::max(), + ranking_score_threshold: None, } } @@ -146,6 +148,14 @@ impl<'a> Search<'a> { self } + pub fn ranking_score_threshold( + &mut self, + ranking_score_threshold: Option, + ) -> &mut Search<'a> { + self.ranking_score_threshold = ranking_score_threshold; + self + } + pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result { if has_vector_search { let ctx = SearchContext::new(self.index, self.rtxn)?; @@ -184,6 +194,7 @@ impl<'a> Search<'a> { embedder_name, embedder, self.time_budget.clone(), + self.ranking_score_threshold, )? 
} _ => execute_search( @@ -201,6 +212,7 @@ impl<'a> Search<'a> { &mut DefaultSearchLogger, &mut DefaultSearchLogger, self.time_budget.clone(), + self.ranking_score_threshold, )?, }; @@ -239,6 +251,7 @@ impl fmt::Debug for Search<'_> { index: _, semantic, time_budget, + ranking_score_threshold, } = self; f.debug_struct("Search") .field("query", query) @@ -257,6 +270,7 @@ impl fmt::Debug for Search<'_> { &semantic.as_ref().map(|semantic| &semantic.embedder_name), ) .field("time_budget", time_budget) + .field("ranking_score_threshold", ranking_score_threshold) .finish() } } diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index e9bc5449d..b15e735d0 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -28,6 +28,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( scoring_strategy: ScoringStrategy, logger: &mut dyn SearchLogger, time_budget: TimeBudget, + ranking_score_threshold: Option, ) -> Result { logger.initial_query(query); logger.ranking_rules(&ranking_rules); @@ -144,6 +145,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx, from, length, + ranking_score_threshold, logger, &mut valid_docids, &mut valid_scores, @@ -164,7 +166,9 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( loop { let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); ranking_rule_scores.push(ScoreDetails::Skipped); + maybe_add_to_results!(bucket); + ranking_rule_scores.pop(); if cur_ranking_rule_index == 0 { @@ -220,6 +224,17 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( debug_assert!( ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) ); + + if let Some(ranking_score_threshold) = ranking_score_threshold { + let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); + if current_score < ranking_score_threshold { + all_candidates -= + next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + 
back!(); + continue; + } + } + ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 @@ -262,6 +277,7 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, from: usize, length: usize, + ranking_score_threshold: Option, logger: &mut dyn SearchLogger, valid_docids: &mut Vec, @@ -279,6 +295,15 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( ranking_rule_scores: &[ScoreDetails], candidates: RoaringBitmap, ) -> Result<()> { + // remove candidates from the universe without adding them to result if their score is below the threshold + if let Some(ranking_score_threshold) = ranking_score_threshold { + let score = ScoreDetails::global_score(ranking_rule_scores.iter()); + if score < ranking_score_threshold { + *all_candidates -= candidates | &ranking_rule_universes[cur_ranking_rule_index]; + return Ok(()); + } + } + // First apply the distinct rule on the candidates, reducing the universes if necessary let candidates = if let Some(distinct_fid) = distinct_fid { let DistinctOutput { remaining, excluded } = diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index f121971b8..87ddb2915 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -523,6 +523,7 @@ mod tests { &mut crate::DefaultSearchLogger, &mut crate::DefaultSearchLogger, TimeBudget::max(), + None, ) .unwrap(); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e152dd233..bbeab31fd 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -568,6 +568,7 @@ pub fn execute_vector_search( embedder_name: &str, embedder: &Embedder, time_budget: TimeBudget, + ranking_score_threshold: Option, ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; @@ -597,6 +598,7 @@ pub fn execute_vector_search( scoring_strategy, placeholder_search_logger, time_budget, + ranking_score_threshold, 
)?; Ok(PartialSearchResult { @@ -626,6 +628,7 @@ pub fn execute_search( placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, time_budget: TimeBudget, + ranking_score_threshold: Option, ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; @@ -714,6 +717,7 @@ pub fn execute_search( scoring_strategy, query_graph_logger, time_budget, + ranking_score_threshold, )? } else { let ranking_rules = @@ -728,6 +732,7 @@ pub fn execute_search( scoring_strategy, placeholder_search_logger, time_budget, + ranking_score_threshold, )? }; From c26db7878c99fa0ca3e866d0c690181e41fb118c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 11 Apr 2024 19:04:43 +0200 Subject: [PATCH 002/110] Expose rankingScoreThreshold in API --- meilisearch-types/src/deserr/mod.rs | 1 + meilisearch-types/src/error.rs | 10 ++++ .../src/analytics/segment_analytics.rs | 3 + .../src/routes/indexes/facet_search.rs | 8 ++- meilisearch/src/routes/indexes/search.rs | 23 +++++++- meilisearch/src/search.rs | 55 +++++++++++++++---- milli/examples/search.rs | 1 + 7 files changed, 85 insertions(+), 16 deletions(-) diff --git a/meilisearch-types/src/deserr/mod.rs b/meilisearch-types/src/deserr/mod.rs index c593c50fb..198a4e7b7 100644 --- a/meilisearch-types/src/deserr/mod.rs +++ b/meilisearch-types/src/deserr/mod.rs @@ -189,4 +189,5 @@ merge_with_error_impl_take_error_message!(ParseTaskKindError); merge_with_error_impl_take_error_message!(ParseTaskStatusError); merge_with_error_impl_take_error_message!(IndexUidFormatError); merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio); +merge_with_error_impl_take_error_message!(InvalidSearchRankingScoreThreshold); merge_with_error_impl_take_error_message!(InvalidSimilarId); diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index d2218807f..bf8eaba1c 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -241,6 +241,7 @@ InvalidSearchAttributesToCrop , 
InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; +InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ; InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; @@ -505,6 +506,15 @@ impl fmt::Display for deserr_codes::InvalidSimilarId { } } +impl fmt::Display for deserr_codes::InvalidSearchRankingScoreThreshold { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`." + ) + } +} + #[macro_export] macro_rules! internal_error { ($target:ty : $($other:path), *) => { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index add430893..10583da1b 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -676,6 +676,7 @@ impl SearchAggregator { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = query; let mut ret = Self::default(); @@ -1087,6 +1088,7 @@ impl MultiSearchAggregator { matching_strategy: _, attributes_to_search_on: _, hybrid: _, + ranking_score_threshold: _, } = query; index_uid.as_str() @@ -1234,6 +1236,7 @@ impl FacetSearchAggregator { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = query; let mut ret = Self::default(); diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 3f05fa846..845b476fe 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -14,9 +14,7 @@ use crate::extractors::authentication::policies::*; use 
crate::extractors::authentication::GuardedData; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, SearchQuery, - DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET }; use crate::search_queue::SearchQueue; @@ -46,6 +44,8 @@ pub struct FacetSearchQuery { pub matching_strategy: MatchingStrategy, #[deserr(default, error = DeserrJsonError, default)] pub attributes_to_search_on: Option>, + #[deserr(default, error = DeserrJsonError, default)] + pub ranking_score_threshold: Option, } pub async fn search( @@ -103,6 +103,7 @@ impl From for SearchQuery { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = value; SearchQuery { @@ -128,6 +129,7 @@ impl From for SearchQuery { vector, attributes_to_search_on, hybrid, + ranking_score_threshold, } } } diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 8628da6d9..7f5acbd37 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -19,9 +19,10 @@ use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::search::{ - add_search_rules, perform_search, HybridQuery, MatchingStrategy, SearchKind, SearchQuery, - SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, + add_search_rules, perform_search, HybridQuery, MatchingStrategy, 
RankingScoreThreshold, + SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, }; use crate::search_queue::SearchQueue; @@ -82,6 +83,21 @@ pub struct SearchQueryGet { pub hybrid_embedder: Option, #[deserr(default, error = DeserrQueryParamError)] pub hybrid_semantic_ratio: Option, + #[deserr(default, error = DeserrQueryParamError, default)] + pub ranking_score_threshold: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] +#[deserr(try_from(String) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)] +pub struct RankingScoreThresholdGet(RankingScoreThreshold); + +impl std::convert::TryFrom for RankingScoreThresholdGet { + type Error = InvalidSearchRankingScoreThreshold; + + fn try_from(s: String) -> Result { + let f: f64 = s.parse().map_err(|_| InvalidSearchRankingScoreThreshold)?; + Ok(RankingScoreThresholdGet(RankingScoreThreshold::try_from(f)?)) + } } #[derive(Debug, Clone, Copy, Default, PartialEq, deserr::Deserr)] @@ -152,6 +168,7 @@ impl From for SearchQuery { matching_strategy: other.matching_strategy, attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()), hybrid, + ranking_score_threshold: other.ranking_score_threshold.map(|o| o.0), } } } diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index c6c4e88ca..f4648a9d5 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -87,6 +87,26 @@ pub struct SearchQuery { pub matching_strategy: MatchingStrategy, #[deserr(default, error = DeserrJsonError, default)] pub attributes_to_search_on: Option>, + #[deserr(default, error = DeserrJsonError, default)] + pub ranking_score_threshold: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Deserr)] +#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)] +pub struct RankingScoreThreshold(f64); + 
+impl std::convert::TryFrom for RankingScoreThreshold { + type Error = InvalidSearchRankingScoreThreshold; + + fn try_from(f: f64) -> Result { + // the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable + #[allow(clippy::manual_range_contains)] + if f > 1.0 || f < 0.0 { + Err(InvalidSearchRankingScoreThreshold) + } else { + Ok(RankingScoreThreshold(f)) + } + } } // Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum. @@ -117,6 +137,7 @@ impl fmt::Debug for SearchQuery { crop_marker, matching_strategy, attributes_to_search_on, + ranking_score_threshold, } = self; let mut debug = f.debug_struct("SearchQuery"); @@ -188,6 +209,9 @@ impl fmt::Debug for SearchQuery { debug.field("highlight_pre_tag", &highlight_pre_tag); debug.field("highlight_post_tag", &highlight_post_tag); debug.field("crop_marker", &crop_marker); + if let Some(ranking_score_threshold) = ranking_score_threshold { + debug.field("ranking_score_threshold", &ranking_score_threshold); + } debug.finish() } @@ -356,6 +380,8 @@ pub struct SearchQueryWithIndex { pub matching_strategy: MatchingStrategy, #[deserr(default, error = DeserrJsonError, default)] pub attributes_to_search_on: Option>, + #[deserr(default, error = DeserrJsonError, default)] + pub ranking_score_threshold: Option, } impl SearchQueryWithIndex { @@ -384,6 +410,7 @@ impl SearchQueryWithIndex { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, } = self; ( index_uid, @@ -410,6 +437,7 @@ impl SearchQueryWithIndex { matching_strategy, attributes_to_search_on, hybrid, + ranking_score_threshold, // do not use ..Default::default() here, // rather add any missing field from `SearchQuery` to `SearchQueryWithIndex` }, @@ -661,6 +689,7 @@ fn prepare_search<'t>( ) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> { let mut search = index.search(rtxn); search.time_budget(time_budget); + 
search.ranking_score_threshold(query.ranking_score_threshold.map(|rst| rst.0)); match search_kind { SearchKind::KeywordOnly => { @@ -702,11 +731,16 @@ fn prepare_search<'t>( .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS); search.exhaustive_number_hits(is_finite_pagination); - search.scoring_strategy(if query.show_ranking_score || query.show_ranking_score_details { - ScoringStrategy::Detailed - } else { - ScoringStrategy::Skip - }); + search.scoring_strategy( + if query.show_ranking_score + || query.show_ranking_score_details + || query.ranking_score_threshold.is_some() + { + ScoringStrategy::Detailed + } else { + ScoringStrategy::Skip + }, + ); // compute the offset on the limit depending on the pagination mode. let (offset, limit) = if is_finite_pagination { @@ -784,10 +818,6 @@ pub fn perform_search( let SearchQuery { q, - vector: _, - hybrid: _, - // already computed from prepare_search - offset: _, limit, page, hits_per_page, @@ -798,14 +828,19 @@ pub fn perform_search( show_matches_position, show_ranking_score, show_ranking_score_details, - filter: _, sort, facets, highlight_pre_tag, highlight_post_tag, crop_marker, + // already used in prepare_search + vector: _, + hybrid: _, + offset: _, + ranking_score_threshold: _, matching_strategy: _, attributes_to_search_on: _, + filter: _, } = query; let format = AttributesFormat { diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 2779f5b15..0195c396f 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -66,6 +66,7 @@ fn main() -> Result<(), Box> { &mut DefaultSearchLogger, logger, TimeBudget::max(), + None, )?; if let Some((logger, dir)) = detailed_logger { logger.finish(&mut ctx, Path::new(dir))?; From 4f03b0cf5b87a6a5fdf2d21f0c8d858b167576e4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 30 May 2024 10:34:09 +0200 Subject: [PATCH 003/110] Add ranking score threshold to similar --- milli/src/search/similar.rs | 49 +++++++++++++++++++++++++++++++------ 1 file changed, 42 
insertions(+), 7 deletions(-) diff --git a/milli/src/search/similar.rs b/milli/src/search/similar.rs index 49b7c876f..bf5cc323f 100644 --- a/milli/src/search/similar.rs +++ b/milli/src/search/similar.rs @@ -17,6 +17,7 @@ pub struct Similar<'a> { index: &'a Index, embedder_name: String, embedder: Arc, + ranking_score_threshold: Option, } impl<'a> Similar<'a> { @@ -29,7 +30,17 @@ impl<'a> Similar<'a> { embedder_name: String, embedder: Arc, ) -> Self { - Self { id, filter: None, offset, limit, rtxn, index, embedder_name, embedder } + Self { + id, + filter: None, + offset, + limit, + rtxn, + index, + embedder_name, + embedder, + ranking_score_threshold: None, + } } pub fn filter(&mut self, filter: Filter<'a>) -> &mut Self { @@ -37,8 +48,18 @@ impl<'a> Similar<'a> { self } + pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Self { + self.ranking_score_threshold = Some(ranking_score_threshold); + self + } + pub fn execute(&self) -> Result { - let universe = filtered_universe(self.index, self.rtxn, &self.filter)?; + let mut universe = filtered_universe(self.index, self.rtxn, &self.filter)?; + + // we never want to receive the docid + universe.remove(self.id); + + let universe = universe; let embedder_index = self.index @@ -77,6 +98,8 @@ impl<'a> Similar<'a> { let mut documents_seen = RoaringBitmap::new(); documents_seen.insert(self.id); + let mut candidates = universe; + for (docid, distance) in results .into_iter() // skip documents we've already seen & mark that we saw the current document @@ -85,8 +108,6 @@ impl<'a> Similar<'a> { // take **after** filter and skip so that we get exactly limit elements if available .take(self.limit) { - documents_ids.push(docid); - let score = 1.0 - distance; let score = self .embedder @@ -94,14 +115,28 @@ impl<'a> Similar<'a> { .map(|distribution| distribution.shift(score)) .unwrap_or(score); - let score = ScoreDetails::Vector(score_details::Vector { similarity: Some(score) }); + let score_details = + 
vec![ScoreDetails::Vector(score_details::Vector { similarity: Some(score) })]; - document_scores.push(vec![score]); + let score = ScoreDetails::global_score(score_details.iter()); + + if let Some(ranking_score_threshold) = &self.ranking_score_threshold { + if score < *ranking_score_threshold { + // this document is no longer a candidate + candidates.remove(docid); + // any document after this one is no longer a candidate either, so restrict the set to documents already seen. + candidates &= documents_seen; + break; + } + } + + documents_ids.push(docid); + document_scores.push(score_details); } Ok(SearchResult { matching_words: Default::default(), - candidates: universe, + candidates, documents_ids, document_scores, degraded: false, From 7ce2691374daede46dfb2a8051ea062811f391f0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 30 May 2024 11:21:31 +0200 Subject: [PATCH 004/110] Add ranking score threshold to similar API --- meilisearch-types/src/deserr/mod.rs | 1 + meilisearch-types/src/error.rs | 7 +++++ meilisearch/src/routes/indexes/similar.rs | 35 +++++++++++++++++------ meilisearch/src/search.rs | 25 ++++++++++++++++ 4 files changed, 59 insertions(+), 9 deletions(-) diff --git a/meilisearch-types/src/deserr/mod.rs b/meilisearch-types/src/deserr/mod.rs index 198a4e7b7..1c1b0e987 100644 --- a/meilisearch-types/src/deserr/mod.rs +++ b/meilisearch-types/src/deserr/mod.rs @@ -190,4 +190,5 @@ merge_with_error_impl_take_error_message!(ParseTaskStatusError); merge_with_error_impl_take_error_message!(IndexUidFormatError); merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio); merge_with_error_impl_take_error_message!(InvalidSearchRankingScoreThreshold); +merge_with_error_impl_take_error_message!(InvalidSimilarRankingScoreThreshold); merge_with_error_impl_take_error_message!(InvalidSimilarId); diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index bf8eaba1c..150c56b9d 100644 --- a/meilisearch-types/src/error.rs +++ 
b/meilisearch-types/src/error.rs @@ -242,6 +242,7 @@ InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; +InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ; InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; @@ -515,6 +516,12 @@ impl fmt::Display for deserr_codes::InvalidSearchRankingScoreThreshold { } } +impl fmt::Display for deserr_codes::InvalidSimilarRankingScoreThreshold { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + deserr_codes::InvalidSearchRankingScoreThreshold.fmt(f) + } +} + #[macro_export] macro_rules! internal_error { ($target:ty : $($other:path), *) => { diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index da73dd63b..518fedab7 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -6,8 +6,8 @@ use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::{ InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId, - InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarShowRankingScore, - InvalidSimilarShowRankingScoreDetails, + InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold, + InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails, }; use meilisearch_types::error::{ErrorCode as _, ResponseError}; use meilisearch_types::index_uid::IndexUid; @@ -21,8 +21,8 @@ use crate::analytics::{Analytics, SimilarAggregator}; use crate::extractors::authentication::GuardedData; use 
crate::extractors::sequential_extractor::SeqHandler; use crate::search::{ - add_search_rules, perform_similar, SearchKind, SimilarQuery, SimilarResult, - DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery, + SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -42,9 +42,7 @@ pub async fn similar_get( ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let query = params.0.try_into().map_err(|code: InvalidSimilarId| { - ResponseError::from_msg(code.to_string(), code.error_code()) - })?; + let query = params.0.try_into()?; let mut aggregate = SimilarAggregator::from_query(&query, &req); @@ -130,12 +128,27 @@ pub struct SimilarQueryGet { show_ranking_score: Param, #[deserr(default, error = DeserrQueryParamError)] show_ranking_score_details: Param, + #[deserr(default, error = DeserrQueryParamError, default)] + pub ranking_score_threshold: Option, #[deserr(default, error = DeserrQueryParamError)] pub embedder: Option, } +#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] +#[deserr(try_from(String) = TryFrom::try_from -> InvalidSimilarRankingScoreThreshold)] +pub struct RankingScoreThresholdGet(RankingScoreThresholdSimilar); + +impl std::convert::TryFrom for RankingScoreThresholdGet { + type Error = InvalidSimilarRankingScoreThreshold; + + fn try_from(s: String) -> Result { + let f: f64 = s.parse().map_err(|_| InvalidSimilarRankingScoreThreshold)?; + Ok(RankingScoreThresholdGet(RankingScoreThresholdSimilar::try_from(f)?)) + } +} + impl TryFrom for SimilarQuery { - type Error = InvalidSimilarId; + type Error = ResponseError; fn try_from( SimilarQueryGet { @@ -147,6 +160,7 @@ impl TryFrom for SimilarQuery { show_ranking_score, show_ranking_score_details, embedder, + ranking_score_threshold, }: SimilarQueryGet, ) -> Result { let filter = match filter { @@ -158,7 +172,9 @@ impl TryFrom for 
SimilarQuery { }; Ok(SimilarQuery { - id: id.0.try_into()?, + id: id.0.try_into().map_err(|code: InvalidSimilarId| { + ResponseError::from_msg(code.to_string(), code.error_code()) + })?, offset: offset.0, limit: limit.0, filter, @@ -166,6 +182,7 @@ impl TryFrom for SimilarQuery { attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()), show_ranking_score: show_ranking_score.0, show_ranking_score_details: show_ranking_score_details.0, + ranking_score_threshold: ranking_score_threshold.map(|x| x.0), }) } } diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index f4648a9d5..23f9d3f79 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -109,6 +109,24 @@ impl std::convert::TryFrom for RankingScoreThreshold { } } +#[derive(Debug, Clone, Copy, PartialEq, Deserr)] +#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSimilarRankingScoreThreshold)] +pub struct RankingScoreThresholdSimilar(f64); + +impl std::convert::TryFrom for RankingScoreThresholdSimilar { + type Error = InvalidSimilarRankingScoreThreshold; + + fn try_from(f: f64) -> Result { + // the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable + #[allow(clippy::manual_range_contains)] + if f > 1.0 || f < 0.0 { + Err(InvalidSimilarRankingScoreThreshold) + } else { + Ok(Self(f)) + } + } +} + // Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum. 
// - Only what IS used, we know everything else is set to None so there is no need to print it // - Re-order the most important field to debug first @@ -464,6 +482,8 @@ pub struct SimilarQuery { pub show_ranking_score: bool, #[deserr(default, error = DeserrJsonError, default)] pub show_ranking_score_details: bool, + #[deserr(default, error = DeserrJsonError, default)] + pub ranking_score_threshold: Option, } #[derive(Debug, Clone, PartialEq, Deserr)] @@ -1102,6 +1122,7 @@ pub fn perform_similar( attributes_to_retrieve, show_ranking_score, show_ranking_score_details, + ranking_score_threshold, } = query; // using let-else rather than `?` so that the borrow checker identifies we're always returning here, @@ -1125,6 +1146,10 @@ pub fn perform_similar( } } + if let Some(ranking_score_threshold) = ranking_score_threshold { + similar.ranking_score_threshold(ranking_score_threshold.0); + } + let milli::SearchResult { documents_ids, matching_words: _, From c36410fcbf07c3153d7c888a8ae61bcfa8a2a273 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 30 May 2024 11:22:12 +0200 Subject: [PATCH 005/110] Analytics for ranking score threshold --- meilisearch/src/analytics/segment_analytics.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 10583da1b..aed29e612 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -648,6 +648,7 @@ pub struct SearchAggregator { // scoring show_ranking_score: bool, show_ranking_score_details: bool, + ranking_score_threshold: bool, } impl SearchAggregator { @@ -749,6 +750,7 @@ impl SearchAggregator { ret.show_ranking_score = *show_ranking_score; ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); if let Some(hybrid) = hybrid { ret.semantic_ratio = hybrid.semantic_ratio != 
DEFAULT_SEMANTIC_RATIO(); @@ -822,6 +824,7 @@ impl SearchAggregator { hybrid, total_degraded, total_used_negative_operator, + ranking_score_threshold, } = other; if self.timestamp.is_none() { @@ -905,6 +908,7 @@ impl SearchAggregator { // scoring self.show_ranking_score |= show_ranking_score; self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; } pub fn into_event(self, user: &User, event_name: &str) -> Option { @@ -946,6 +950,7 @@ impl SearchAggregator { hybrid, total_degraded, total_used_negative_operator, + ranking_score_threshold, } = self; if total_received == 0 { @@ -1016,6 +1021,7 @@ impl SearchAggregator { "scoring": { "show_ranking_score": show_ranking_score, "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, }, }); @@ -1251,7 +1257,8 @@ impl FacetSearchAggregator { || filter.is_some() || *matching_strategy != MatchingStrategy::default() || attributes_to_search_on.is_some() - || hybrid.is_some(); + || hybrid.is_some() + || ranking_score_threshold.is_some(); ret } @@ -1627,6 +1634,7 @@ pub struct SimilarAggregator { // scoring show_ranking_score: bool, show_ranking_score_details: bool, + ranking_score_threshold: bool, } impl SimilarAggregator { @@ -1641,6 +1649,7 @@ impl SimilarAggregator { show_ranking_score, show_ranking_score_details, filter, + ranking_score_threshold, } = query; let mut ret = Self::default(); @@ -1678,6 +1687,7 @@ impl SimilarAggregator { ret.show_ranking_score = *show_ranking_score; ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); ret.embedder = embedder.is_some(); @@ -1711,6 +1721,7 @@ impl SimilarAggregator { show_ranking_score, show_ranking_score_details, embedder, + ranking_score_threshold, } = other; if self.timestamp.is_none() { @@ -1752,6 +1763,7 @@ impl SimilarAggregator { // scoring self.show_ranking_score |= 
show_ranking_score; self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; } pub fn into_event(self, user: &User, event_name: &str) -> Option { @@ -1772,6 +1784,7 @@ impl SimilarAggregator { show_ranking_score, show_ranking_score_details, embedder, + ranking_score_threshold, } = self; if total_received == 0 { @@ -1811,6 +1824,7 @@ impl SimilarAggregator { "scoring": { "show_ranking_score": show_ranking_score, "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, }, }); From 41976b82b1c538c8effbff2f0ad1ea5ccf4234e1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 30 May 2024 11:22:26 +0200 Subject: [PATCH 006/110] Tests for ranking_score_threshold --- meilisearch/tests/search/errors.rs | 34 ++++ meilisearch/tests/search/mod.rs | 232 ++++++++++++++++++++++++++++ meilisearch/tests/similar/errors.rs | 62 ++++++++ meilisearch/tests/similar/mod.rs | 229 +++++++++++++++++++++++++++ 4 files changed, 557 insertions(+) diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 8be70d162..9c7c361b6 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -321,6 +321,40 @@ async fn search_bad_facets() { // Can't make the `attributes_to_highlight` fail with a get search since it'll accept anything as an array of strings. 
} +#[actix_rt::test] +async fn search_bad_threshold() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"rankingScoreThreshold": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.rankingScoreThreshold`: expected a number, but found a string: `\"doggo\"`", + "code": "invalid_search_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_ranking_score_threshold" + } + "###); +} + +#[actix_rt::test] +async fn search_invalid_threshold() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"rankingScoreThreshold": 42})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value at `.rankingScoreThreshold`: the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`.", + "code": "invalid_search_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_ranking_score_threshold" + } + "###); +} + #[actix_rt::test] async fn search_non_filterable_facets() { let server = Server::new().await; diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 56fa226b2..7774755bc 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -47,6 +47,31 @@ static DOCUMENTS: Lazy = Lazy::new(|| { ]) }); +static SCORE_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + }, + { + "title": "Batman Returns", + "id": "C", + }, + { + "title": "Batman", + "id": "D", + }, + { + "title": "Badman", + "id": "E", + } + ]) +}); + static NESTED_DOCUMENTS: Lazy = 
Lazy::new(|| { json!([ { @@ -959,6 +984,213 @@ async fn test_score_details() { .await; } +#[actix_rt::test] +async fn test_score() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = SCORE_DOCUMENTS.clone(); + + let res = index.add_documents(json!(documents), None).await; + index.wait_task(res.0.uid()).await; + + index + .search( + json!({ + "q": "Badman the dark knight returns 1", + "showRankingScore": true, + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.9746605609456898 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.8055252965383685 + }, + { + "title": "Badman", + "id": "E", + "_rankingScore": 0.16666666666666666 + }, + { + "title": "Batman Returns", + "id": "C", + "_rankingScore": 0.07702020202020202 + }, + { + "title": "Batman", + "id": "D", + "_rankingScore": 0.07702020202020202 + } + ] + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn test_score_threshold() { + let query = "Badman dark returns 1"; + let server = Server::new().await; + let index = server.index("test"); + + let documents = SCORE_DOCUMENTS.clone(); + + let res = index.add_documents(json!(documents), None).await; + index.wait_task(res.0.uid()).await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.0 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"5"); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.6685627880184332 + }, + 
{ + "title": "Badman", + "id": "E", + "_rankingScore": 0.25 + }, + { + "title": "Batman Returns", + "id": "C", + "_rankingScore": 0.11553030303030302 + }, + { + "title": "Batman", + "id": "D", + "_rankingScore": 0.11553030303030302 + } + ] + "###); + }, + ) + .await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.2 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"3"###); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.6685627880184332 + }, + { + "title": "Badman", + "id": "E", + "_rankingScore": 0.25 + } + ] + "###); + }, + ) + .await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.5 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"2"###); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_rankingScore": 0.6685627880184332 + } + ] + "###); + }, + ) + .await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 0.8 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"1"###); + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" + [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_rankingScore": 0.93430081300813 + } 
+ ] + "###); + }, + ) + .await; + + index + .search( + json!({ + "q": query, + "showRankingScore": true, + "rankingScoreThreshold": 1.0 + }), + |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @r###"0"###); + // nobody is perfect + meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @"[]"); + }, + ) + .await; +} + #[actix_rt::test] async fn test_degraded_score_details() { let server = Server::new().await; diff --git a/meilisearch/tests/similar/errors.rs b/meilisearch/tests/similar/errors.rs index 64386a7bf..7765b9a85 100644 --- a/meilisearch/tests/similar/errors.rs +++ b/meilisearch/tests/similar/errors.rs @@ -87,6 +87,68 @@ async fn similar_bad_id() { "###); } +#[actix_rt::test] +async fn similar_bad_ranking_score_threshold() { + let server = Server::new().await; + let index = server.index("test"); + server.set_features(json!({"vectorStore": true})).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + "filterableAttributes": ["title"]})) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index.similar_post(json!({"rankingScoreThreshold": ["doggo"]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.rankingScoreThreshold`: expected a number, but found an array: `[\"doggo\"]`", + "code": "invalid_similar_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_ranking_score_threshold" + } + "###); +} + +#[actix_rt::test] +async fn similar_invalid_ranking_score_threshold() { + let server = Server::new().await; + let index = server.index("test"); + server.set_features(json!({"vectorStore": true})).await; + + let (response, code) = index + 
.update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + "filterableAttributes": ["title"]})) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index.similar_post(json!({"rankingScoreThreshold": 42})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value at `.rankingScoreThreshold`: the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`.", + "code": "invalid_similar_ranking_score_threshold", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_ranking_score_threshold" + } + "###); +} + #[actix_rt::test] async fn similar_invalid_id() { let server = Server::new().await; diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index ee78917cb..bde23b67f 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -194,6 +194,235 @@ async fn basic() { .await; } +#[actix_rt::test] +async fn ranking_score_threshold() { + let server = Server::new().await; + let index = server.index("test"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + "filterableAttributes": ["title"]})) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = DOCUMENTS.clone(); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0}), + |response, code| { 
+ snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": [ + 0.1, + 0.6, + 0.8 + ] + }, + "_rankingScore": 0.890957772731781 + }, + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": [ + 0.6, + 0.8, + -0.2 + ] + }, + "_rankingScore": 0.39060014486312866 + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": [ + 0.7, + 0.7, + -0.4 + ] + }, + "_rankingScore": 0.2819308042526245 + }, + { + "title": "Shazam!", + "release_year": 2019, + "id": "287947", + "_vectors": { + "manual": [ + 0.8, + 0.4, + -0.5 + ] + }, + "_rankingScore": 0.1662663221359253 + } + ] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2}), + |response, code| { + snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": [ + 0.1, + 0.6, + 0.8 + ] + }, + "_rankingScore": 0.890957772731781 + }, + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": [ + 0.6, + 0.8, + -0.2 + ] + }, + "_rankingScore": 0.39060014486312866 + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": [ + 0.7, + 0.7, + -0.4 + ] + }, + "_rankingScore": 0.2819308042526245 + } + ] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3}), + |response, code| { + snapshot!(code, @"200 OK"); + 
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": [ + 0.1, + 0.6, + 0.8 + ] + }, + "_rankingScore": 0.890957772731781 + }, + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": [ + 0.6, + 0.8, + -0.2 + ] + }, + "_rankingScore": 0.39060014486312866 + } + ] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6}), + |response, code| { + snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Escape Room", + "release_year": 2019, + "id": "522681", + "_vectors": { + "manual": [ + 0.1, + 0.6, + 0.8 + ] + }, + "_rankingScore": 0.890957772731781 + } + ] + "###); + }, + ) + .await; + + index + .similar( + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @"[]"); + }, + ) + .await; +} + #[actix_rt::test] async fn filter() { let server = Server::new().await; From c2fb7afe5906de1ebe674421f03106841ce39f8d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 30 May 2024 12:06:46 +0200 Subject: [PATCH 007/110] fmt --- meilisearch/src/routes/indexes/facet_search.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 845b476fe..10b371f2d 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -14,7 +14,9 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use 
crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET + add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, + SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, + DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; use crate::search_queue::SearchQueue; From 2b6db6541e2f1667c6d42ed72fcadcaacc311f41 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 3 Jun 2024 10:28:15 +0200 Subject: [PATCH 008/110] Changes after review --- meilisearch/src/routes/indexes/search.rs | 2 +- meilisearch/src/search.rs | 4 +++- milli/src/search/mod.rs | 7 ++----- milli/src/search/new/bucket_sort.rs | 22 +++++++++++----------- 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 7f5acbd37..348d8295c 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -83,7 +83,7 @@ pub struct SearchQueryGet { pub hybrid_embedder: Option, #[deserr(default, error = DeserrQueryParamError)] pub hybrid_semantic_ratio: Option, - #[deserr(default, error = DeserrQueryParamError, default)] + #[deserr(default, error = DeserrQueryParamError)] pub ranking_score_threshold: Option, } diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 23f9d3f79..2e218c73c 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -709,7 +709,9 @@ fn prepare_search<'t>( ) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> { let mut search = index.search(rtxn); search.time_budget(time_budget); - search.ranking_score_threshold(query.ranking_score_threshold.map(|rst| rst.0)); + 
if let Some(ranking_score_threshold) = query.ranking_score_threshold { + search.ranking_score_threshold(ranking_score_threshold.0); + } match search_kind { SearchKind::KeywordOnly => { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f7bcf6e7b..cbdd3af39 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -148,11 +148,8 @@ impl<'a> Search<'a> { self } - pub fn ranking_score_threshold( - &mut self, - ranking_score_threshold: Option, - ) -> &mut Search<'a> { - self.ranking_score_threshold = ranking_score_threshold; + pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Search<'a> { + self.ranking_score_threshold = Some(ranking_score_threshold); self } diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index b15e735d0..d937c78bf 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -145,7 +145,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx, from, length, - ranking_score_threshold, logger, &mut valid_docids, &mut valid_scores, @@ -167,6 +166,16 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); ranking_rule_scores.push(ScoreDetails::Skipped); + // remove candidates from the universe without adding them to result if their score is below the threshold + if let Some(ranking_score_threshold) = ranking_score_threshold { + let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); + if current_score < ranking_score_threshold { + all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index]; + back!(); + continue; + } + } + maybe_add_to_results!(bucket); ranking_rule_scores.pop(); @@ -225,6 +234,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) ); + // remove candidates from the universe without adding them to result if 
their score is below the threshold if let Some(ranking_score_threshold) = ranking_score_threshold { let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); if current_score < ranking_score_threshold { @@ -277,7 +287,6 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, from: usize, length: usize, - ranking_score_threshold: Option, logger: &mut dyn SearchLogger, valid_docids: &mut Vec, @@ -295,15 +304,6 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( ranking_rule_scores: &[ScoreDetails], candidates: RoaringBitmap, ) -> Result<()> { - // remove candidates from the universe without adding them to result if their score is below the threshold - if let Some(ranking_score_threshold) = ranking_score_threshold { - let score = ScoreDetails::global_score(ranking_rule_scores.iter()); - if score < ranking_score_threshold { - *all_candidates -= candidates | &ranking_rule_universes[cur_ranking_rule_index]; - return Ok(()); - } - } - // First apply the distinct rule on the candidates, reducing the universes if necessary let candidates = if let Some(distinct_fid) = distinct_fid { let DistinctOutput { remaining, excluded } = From 84126659573570dd3bcdcdd5ee774631793f981d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Jun 2024 09:54:30 +0200 Subject: [PATCH 009/110] Update actix-web 4.5.1 -> 4.6.0 --- Cargo.lock | 45 +++++++++++++++++++++--------------- meilisearch-types/Cargo.toml | 9 ++++++-- meilisearch/Cargo.toml | 10 ++++---- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e31943cf3..96c119a19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,9 +36,9 @@ dependencies = [ [[package]] name = "actix-http" -version = "3.6.0" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d223b13fd481fc0d1f83bb12659ae774d9e3601814c68a0bc539731698cca743" +checksum = 
"4eb9843d84c775696c37d9a418bbb01b932629d01870722c0f13eb3f95e2536d" dependencies = [ "actix-codec", "actix-rt", @@ -46,7 +46,7 @@ dependencies = [ "actix-tls", "actix-utils", "ahash", - "base64 0.21.7", + "base64 0.22.1", "bitflags 2.5.0", "brotli", "bytes", @@ -85,13 +85,15 @@ dependencies = [ [[package]] name = "actix-router" -version = "0.5.1" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d66ff4d247d2b160861fa2866457e85706833527840e4133f8f49aa423a38799" +checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8" dependencies = [ "bytestring", + "cfg-if", "http 0.2.11", "regex", + "regex-lite", "serde", "tracing", ] @@ -138,9 +140,9 @@ dependencies = [ [[package]] name = "actix-tls" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4cce60a2f2b477bc72e5cde0af1812a6e82d8fd85b5570a5dcf2a5bf2c5be5f" +checksum = "ac453898d866cdbecdbc2334fe1738c747b4eba14a677261f2b768ba05329389" dependencies = [ "actix-rt", "actix-service", @@ -167,9 +169,9 @@ dependencies = [ [[package]] name = "actix-web" -version = "4.5.1" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a6556ddebb638c2358714d853257ed226ece6023ef9364f23f0c70737ea984" +checksum = "b1cf67dadb19d7c95e5a299e2dda24193b89d5d4f33a3b9800888ede9e19aa32" dependencies = [ "actix-codec", "actix-http", @@ -196,7 +198,7 @@ dependencies = [ "mime", "once_cell", "pin-project-lite", - "regex", + "regex-lite", "serde", "serde_json", "serde_urlencoded", @@ -220,8 +222,9 @@ dependencies = [ [[package]] name = "actix-web-static-files" -version = "3.0.5" -source = "git+https://github.com/kilork/actix-web-static-files.git?rev=2d3b6160#2d3b6160f0de4ba061c5d76b5704f34fb677f6df" +version = "4.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adf6d1ef6d7a60e084f9e0595e2a5234abda14e76c105ecf8e2d0e8800c41a1f" dependencies = [ 
"actix-web", "derive_more", @@ -613,9 +616,9 @@ dependencies = [ [[package]] name = "brotli" -version = "3.4.0" +version = "6.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" +checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -624,9 +627,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "2.5.1" +version = "4.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f" +checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -4340,6 +4343,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" + [[package]] name = "regex-syntax" version = "0.8.2" @@ -5313,9 +5322,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fe0d5feac3f4ca21ba33496bcb1ccab58cca6412b1405ae80f0581541e0ca78" +checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684" dependencies = [ "actix-web", "mutually_exclusive_features", diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index 6d23f144a..f840ceb7e 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -11,7 +11,7 @@ edition.workspace = true license.workspace = true [dependencies] -actix-web = { version = "4.5.1", default-features = false } +actix-web = { version = "4.6.0", default-features = false } anyhow = "1.0.79" convert_case = "0.6.0" csv = "1.3.0" @@ -30,7 +30,12 @@ serde_json = "1.0.111" tar = 
"0.4.40" tempfile = "3.9.0" thiserror = "1.0.56" -time = { version = "0.3.31", features = ["serde-well-known", "formatting", "parsing", "macros"] } +time = { version = "0.3.31", features = [ + "serde-well-known", + "formatting", + "parsing", + "macros", +] } tokio = "1.35" uuid = { version = "1.6.1", features = ["serde", "v4"] } diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 75962c450..ebcbbd266 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -14,20 +14,20 @@ default-run = "meilisearch" [dependencies] actix-cors = "0.7.0" -actix-http = { version = "3.6.0", default-features = false, features = [ +actix-http = { version = "3.7.0", default-features = false, features = [ "compress-brotli", "compress-gzip", "rustls-0_21", ] } actix-utils = "3.0.1" -actix-web = { version = "4.5.1", default-features = false, features = [ +actix-web = { version = "4.6.0", default-features = false, features = [ "macros", "compress-brotli", "compress-gzip", "cookies", "rustls-0_21", ] } -actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true } +actix-web-static-files = { version = "4.0.1", optional = true } anyhow = { version = "1.0.79", features = ["backtrace"] } async-stream = "0.3.5" async-trait = "0.1.77" @@ -105,13 +105,13 @@ url = { version = "2.5.0", features = ["serde"] } tracing = "0.1.40" tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } -tracing-actix-web = "0.7.9" +tracing-actix-web = "0.7.10" build-info = { version = "1.7.0", path = "../build-info" } [dev-dependencies] actix-rt = "2.9.0" assert-json-diff = "2.0.2" -brotli = "3.4.0" +brotli = "6.0.0" insta = "1.34.0" manifest-dir-macros = "0.1.18" maplit = "1.0.2" From 42b3f52ef95416ca9f13ac46e70b77d24ba1deb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 28 May 2024 11:46:53 +0200 Subject: [PATCH 010/110] Introduce 
the SettingDiff only_additional_fields method --- milli/src/update/settings.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 24b32b6fa..c3e4ab3fa 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1112,6 +1112,26 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision } + /// Returns only the additional searchable fields if any + /// other searchable field has been modified, returns None. + pub fn only_additional_fields(&self) -> Option> { + match (&self.old.user_defined_searchable_fields, &self.new.user_defined_searchable_fields) { + (None, None) | (Some(_), None) => None, + (None, Some(new)) => Some(new.iter().cloned().collect()), + (Some(old), Some(new)) => { + let old: HashSet<_> = old.iter().cloned().collect(); + let new: HashSet<_> = new.iter().cloned().collect(); + if old.difference(&new).next().is_none() { + // if no field has been removed + // return only the additional ones + Some(&new - &old) + } else { + None + } + } + } + } + pub fn reindex_facets(&self) -> bool { let existing_fields = &self.new.existing_fields; if existing_fields.iter().any(|field| field.contains('.')) { From 0c6e4b2f0098cd4621ceda79402e5bc634a80c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 28 May 2024 14:53:45 +0200 Subject: [PATCH 011/110] Introducing a new into_del_add_obkv_conditional_operation function --- milli/src/update/del_add.rs | 15 ++++++ milli/src/update/index_documents/transform.rs | 47 +++++++++++++++---- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index 0288858ed..cb5e448f1 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -40,11 +40,26 @@ pub fn into_del_add_obkv( operation: DelAddOperation, buffer: &mut Vec, ) -> Result<(), std::io::Error> { + 
into_del_add_obkv_conditional_operation(reader, buffer, |_| operation) +} + +/// Akin to the [into_del_add_obkv] function but lets you +/// conditionally define the `DelAdd` variant based on the obkv key. +pub fn into_del_add_obkv_conditional_operation( + reader: obkv::KvReader, + buffer: &mut Vec, + operation: F, +) -> std::io::Result<()> +where + K: obkv::Key + PartialOrd, + F: Fn(K) -> DelAddOperation, +{ let mut writer = obkv::KvWriter::new(buffer); let mut value_buffer = Vec::new(); for (key, value) in reader.iter() { value_buffer.clear(); let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); + let operation = operation(key); if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) { value_writer.insert(DelAdd::Deletion, value)?; } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 41a0a55cf..dc6642b8a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -20,7 +20,10 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; +use crate::update::del_add::{ + into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation, + KvReaderDelAdd, +}; use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; @@ -841,13 +844,28 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only the `searchableAttributes` has been changed, keep only the searchable fields. 
+ // However, if only new searchable attributes are added, this function will + // return false has fields do not need to be reindexed. let must_reindex_searchables = settings_diff.reindex_searchable(); - let necessary_searchable_field = |id: FieldId| -> bool { - must_reindex_searchables + let must_index_only_additional_searchables = &settings_diff.only_additional_fields(); + let necessary_searchable_field_to_reindex = move |id: FieldId| -> bool { + must_index_only_additional_searchables.is_none() + && must_reindex_searchables && (settings_diff.old.searchable_fields_ids.contains(&id) || settings_diff.new.searchable_fields_ids.contains(&id)) }; + // If only new `searchableAttributes` are present, keep only those ones. + let additional_searchable_field_only = move |id: FieldId| -> bool { + match must_index_only_additional_searchables { + Some(additional_fields) => { + let additional_field = settings_diff.new.fields_ids_map.name(id).unwrap(); + additional_fields.contains(additional_field) + } + None => false, + } + }; + // If only a faceted field has been added, keep only this field. let must_reindex_facets = settings_diff.reindex_facets(); let necessary_faceted_field = |id: FieldId| -> bool { @@ -862,14 +880,21 @@ impl<'a, 'i> Transform<'a, 'i> { // we need the fields for the prompt/templating. let reindex_vectors = settings_diff.reindex_vectors(); + // The set of additional searchable fields only, + // the only purpose of these fields is to be indexed from scratch. 
+ let mut additional_searchables_only = HashSet::new(); + let mut obkv_writer = KvWriter::<_, FieldId>::memory(); for (id, val) in old_obkv.iter() { if is_primary_key(id) - || necessary_searchable_field(id) + || necessary_searchable_field_to_reindex(id) || necessary_faceted_field(id) || reindex_vectors { obkv_writer.insert(id, val)?; + } else if additional_searchable_field_only(id) { + additional_searchables_only.insert(id); + obkv_writer.insert(id, val)?; } } let data = obkv_writer.into_inner()?; @@ -887,11 +912,15 @@ impl<'a, 'i> Transform<'a, 'i> { let flattened = flattened.as_deref().map_or(obkv, KvReader::new); flattened_obkv_buffer.clear(); - into_del_add_obkv( - flattened, - DelAddOperation::DeletionAndAddition, - flattened_obkv_buffer, - )?; + into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| { + // If the field is only required because it is an additional + // searchable field only define it as an DelAdd::Addition only. + if additional_searchables_only.contains(&id) { + DelAddOperation::Addition + } else { + DelAddOperation::DeletionAndAddition + } + })?; } Ok(()) From 1ab03c4ede545acf90d0c1afa2c0e715c8bbd47a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 14:49:09 +0200 Subject: [PATCH 012/110] Fix an issue with settings diff and * in the searchable attributes --- milli/src/update/settings.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c3e4ab3fa..af37a205c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1112,12 +1112,11 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision } - /// Returns only the additional searchable fields if any - /// other searchable field has been modified, returns None. + /// Returns only the additional searchable fields. + /// If any other searchable field has been modified, returns None. 
pub fn only_additional_fields(&self) -> Option> { match (&self.old.user_defined_searchable_fields, &self.new.user_defined_searchable_fields) { - (None, None) | (Some(_), None) => None, - (None, Some(new)) => Some(new.iter().cloned().collect()), + (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * (Some(old), Some(new)) => { let old: HashSet<_> = old.iter().cloned().collect(); let new: HashSet<_> = new.iter().cloned().collect(); From fad4675abeb8defa7a38544edc6b01491e49af8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 14:50:00 +0200 Subject: [PATCH 013/110] Give the settings diff to the write_typed_chunk_into_index function --- milli/src/update/index_documents/mod.rs | 5 +++-- milli/src/update/index_documents/typed_chunk.rs | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index afae8973a..2420463b4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -369,6 +369,7 @@ where // Run extraction pipeline in parallel. 
pool.install(|| { + let settings_diff_cloned = settings_diff.clone(); rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks"); let _enter = child_span.enter(); @@ -398,7 +399,7 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, - settings_diff.clone(), + settings_diff_cloned, max_positions_per_attributes, ) }); @@ -425,7 +426,7 @@ where Err(status) => { if let Some(typed_chunks) = chunk_accumulator.pop_longest() { let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?; + write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2ef7a8990..fcd8cfc17 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -123,8 +123,10 @@ impl TypedChunk { #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] pub(crate) fn write_typed_chunk_into_index( typed_chunks: Vec, - index: &Index, wtxn: &mut RwTxn, + index: &Index, + settings_diff: &InnerIndexSettingsDiff, + typed_chunks: Vec, ) -> Result<(RoaringBitmap, bool)> { let mut is_merged_database = false; match typed_chunks[0] { From 0f578348f19548bb9bb6dd0b374260c1f75649c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 15:17:51 +0200 Subject: [PATCH 014/110] Introduce a dedicated function to write proximity entries in database --- .../src/update/index_documents/typed_chunk.rs | 79 ++++++++++++++++--- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index fcd8cfc17..4021dea08 100644 --- 
a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -7,7 +7,7 @@ use bytemuck::allocation::pod_collect_to_vec; use charabia::{Language, Script}; use grenad::{Merger, MergerBuilder}; use heed::types::Bytes; -use heed::RwTxn; +use heed::{BytesDecode, RwTxn}; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; @@ -20,13 +20,16 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{ as_cloneable_grenad, keep_latest_obkv, try_split_array_at, }; +use crate::update::settings::InnerIndexSettingsDiff; use crate::{ - lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError, + lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, + Result, SerializationError, U8StrStrCodec, }; /// This struct accumulates and group the TypedChunks @@ -122,7 +125,6 @@ impl TypedChunk { /// Return new documents seen. 
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] pub(crate) fn write_typed_chunk_into_index( - typed_chunks: Vec, wtxn: &mut RwTxn, index: &Index, settings_diff: &InnerIndexSettingsDiff, @@ -487,13 +489,22 @@ pub(crate) fn write_typed_chunk_into_index( } let merger = builder.build(); - write_entries_into_database( - merger, - &index.word_pair_proximity_docids, - wtxn, - deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, - )?; + if settings_diff.only_additional_fields().is_some() { + write_proximity_entries_into_database_additional_searchables( + merger, + &index.word_pair_proximity_docids, + wtxn, + )?; + } else { + write_entries_into_database( + merger, + &index.word_pair_proximity_docids, + wtxn, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + )?; + } + is_merged_database = true; } TypedChunk::FieldIdDocidFacetNumbers(_) => { @@ -832,3 +843,51 @@ where } Ok(()) } + +/// Akin to the `write_entries_into_database` function but specialized +/// for the case when we only index additional searchable fields only. +#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")] +fn write_proximity_entries_into_database_additional_searchables( + merger: Merger, + database: &heed::Database, + wtxn: &mut RwTxn, +) -> Result<()> +where + R: io::Read + io::Seek, +{ + let mut iter = merger.into_stream_merger_iter()?; + while let Some((key, value)) = iter.next()? { + if valid_lmdb_key(key) { + let (proximity_to_insert, word1, word2) = + U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?; + let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) { + Some(value) => { + CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)? 
+ } + None => continue, + }; + + let mut data_to_remove = RoaringBitmap::new(); + for prox in 1..(MAX_DISTANCE as u8) { + let key = (prox, word1, word2); + let database_value = database.get(wtxn, &key)?.unwrap_or_default(); + let value = if prox == proximity_to_insert { + // Proximity that should be changed. + // Union values and remove lower proximity data + (&database_value | &data_to_insert) - &data_to_remove + } else { + // Remove lower proximity data + &database_value - &data_to_remove + }; + + // add the current data in data_to_remove for the next proximities + data_to_remove |= &value; + + if database_value != value { + database.put(wtxn, &key, &value)?; + } + } + } + } + Ok(()) +} From 87cf8a3c94859433da9a50bec37f854b3061f17c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 29 May 2024 17:46:28 +0200 Subject: [PATCH 015/110] Introduce a new way to determine the operations to perform on the fields --- milli/src/update/index_documents/transform.rs | 47 +++---------------- milli/src/update/settings.rs | 28 ++++++++++- 2 files changed, 34 insertions(+), 41 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index dc6642b8a..34e40b7f6 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -843,29 +843,6 @@ impl<'a, 'i> Transform<'a, 'i> { // Always keep the primary key. let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; - // If only the `searchableAttributes` has been changed, keep only the searchable fields. - // However, if only new searchable attributes are added, this function will - // return false has fields do not need to be reindexed. 
- let must_reindex_searchables = settings_diff.reindex_searchable(); - let must_index_only_additional_searchables = &settings_diff.only_additional_fields(); - let necessary_searchable_field_to_reindex = move |id: FieldId| -> bool { - must_index_only_additional_searchables.is_none() - && must_reindex_searchables - && (settings_diff.old.searchable_fields_ids.contains(&id) - || settings_diff.new.searchable_fields_ids.contains(&id)) - }; - - // If only new `searchableAttributes` are present, keep only those ones. - let additional_searchable_field_only = move |id: FieldId| -> bool { - match must_index_only_additional_searchables { - Some(additional_fields) => { - let additional_field = settings_diff.new.fields_ids_map.name(id).unwrap(); - additional_fields.contains(additional_field) - } - None => false, - } - }; - // If only a faceted field has been added, keep only this field. let must_reindex_facets = settings_diff.reindex_facets(); let necessary_faceted_field = |id: FieldId| -> bool { @@ -880,20 +857,16 @@ impl<'a, 'i> Transform<'a, 'i> { // we need the fields for the prompt/templating. let reindex_vectors = settings_diff.reindex_vectors(); - // The set of additional searchable fields only, - // the only purpose of these fields is to be indexed from scratch. - let mut additional_searchables_only = HashSet::new(); + // The operations that we must perform on the different fields. 
+ let mut operations = HashMap::new(); let mut obkv_writer = KvWriter::<_, FieldId>::memory(); for (id, val) in old_obkv.iter() { - if is_primary_key(id) - || necessary_searchable_field_to_reindex(id) - || necessary_faceted_field(id) - || reindex_vectors - { + if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { + operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; - } else if additional_searchable_field_only(id) { - additional_searchables_only.insert(id); + } else if let Some(operation) = settings_diff.reindex_searchable_id(id) { + operations.insert(id, operation); obkv_writer.insert(id, val)?; } } @@ -913,13 +886,7 @@ impl<'a, 'i> Transform<'a, 'i> { flattened_obkv_buffer.clear(); into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| { - // If the field is only required because it is an additional - // searchable field only define it as an DelAdd::Addition only. - if additional_searchables_only.contains(&id) { - DelAddOperation::Addition - } else { - DelAddOperation::DeletionAndAddition - } + operations.get(&id).copied().unwrap_or(DelAddOperation::DeletionAndAddition) })?; } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index af37a205c..b401adff9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -9,6 +9,7 @@ use itertools::{EitherOrBoth, Itertools}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; +use super::del_add::DelAddOperation; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; @@ -1112,6 +1113,31 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision } + pub fn reindex_searchable_id(&self, id: FieldId) -> Option { + if self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + || 
self.old.allowed_separators != self.new.allowed_separators + || self.old.dictionary != self.new.dictionary + || self.old.exact_attributes != self.new.exact_attributes + // Here we can be much more optimal by just deleting the proximity database + || self.old.proximity_precision != self.new.proximity_precision + { + Some(DelAddOperation::DeletionAndAddition) + } else if let Some(only_additional_fields) = self.only_additional_fields() { + let additional_field = self.new.fields_ids_map.name(id).unwrap(); + if only_additional_fields.contains(additional_field) { + Some(DelAddOperation::Addition) + } else { + None + } + } else if self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields + { + Some(DelAddOperation::DeletionAndAddition) + } else { + None + } + } + /// Returns only the additional searchable fields. /// If any other searchable field has been modified, returns None. pub fn only_additional_fields(&self) -> Option> { @@ -1599,7 +1625,7 @@ mod tests { // When we search for something that is not in // the searchable fields it must not return any document. let result = index.search(&rtxn).query("23").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids, Vec::::new()); // When we search for something that is in the searchable fields // we must find the appropriate document. 
From 1b639ce44b694ec8f61dceca352a5b7c1ef93dae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 11:17:03 +0200 Subject: [PATCH 016/110] Reduce the number of complex calls to settings diff functions --- milli/src/update/index_documents/transform.rs | 14 ++-- .../src/update/index_documents/typed_chunk.rs | 2 +- milli/src/update/settings.rs | 69 ++++++++++++------- 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 34e40b7f6..59bab36e8 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -808,13 +808,15 @@ impl<'a, 'i> Transform<'a, 'i> { let mut new_inner_settings = old_inner_settings.clone(); new_inner_settings.fields_ids_map = fields_ids_map; - let settings_diff = InnerIndexSettingsDiff { - old: old_inner_settings, - new: new_inner_settings, + let embedding_configs_updated = false; + let settings_update_only = false; + let settings_diff = InnerIndexSettingsDiff::new( + old_inner_settings, + new_inner_settings, primary_key_id, - embedding_configs_updated: false, - settings_update_only: false, - }; + embedding_configs_updated, + settings_update_only, + ); Ok(TransformOutput { primary_key, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 4021dea08..2fbe91685 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -489,7 +489,7 @@ pub(crate) fn write_typed_chunk_into_index( } let merger = builder.build(); - if settings_diff.only_additional_fields().is_some() { + if settings_diff.only_additional_fields.is_some() { write_proximity_entries_into_database_additional_searchables( merger, &index.word_pair_proximity_docids, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index b401adff9..84eccf8e6 100644 --- 
a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1073,13 +1073,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { .index .primary_key(self.wtxn)? .and_then(|name| new_inner_settings.fields_ids_map.id(name)); - let inner_settings_diff = InnerIndexSettingsDiff { - old: old_inner_settings, - new: new_inner_settings, + let settings_update_only = true; + let inner_settings_diff = InnerIndexSettingsDiff::new( + old_inner_settings, + new_inner_settings, primary_key_id, embedding_configs_updated, - settings_update_only: true, - }; + settings_update_only, + ); if inner_settings_diff.any_reindexing_needed() { self.reindex(&progress_callback, &should_abort, inner_settings_diff)?; @@ -1096,9 +1097,46 @@ pub struct InnerIndexSettingsDiff { // TODO: compare directly the embedders. pub(crate) embedding_configs_updated: bool, pub(crate) settings_update_only: bool, + /// The set of only the additional searchable fields. + /// If any other searchable field has been modified, is set to None. 
+ pub(crate) only_additional_fields: Option>, } impl InnerIndexSettingsDiff { + pub(crate) fn new( + old_settings: InnerIndexSettings, + new_settings: InnerIndexSettings, + primary_key_id: Option, + embedding_configs_updated: bool, + settings_update_only: bool, + ) -> Self { + let only_additional_fields = match ( + &old_settings.user_defined_searchable_fields, + &new_settings.user_defined_searchable_fields, + ) { + (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * + (Some(old), Some(new)) => { + let old: HashSet<_> = old.iter().cloned().collect(); + let new: HashSet<_> = new.iter().cloned().collect(); + if old.difference(&new).next().is_none() { + // if no field has been removed return only the additional ones + Some(&new - &old) + } else { + None + } + } + }; + + InnerIndexSettingsDiff { + old: old_settings, + new: new_settings, + primary_key_id, + embedding_configs_updated, + settings_update_only, + only_additional_fields, + } + } + pub fn any_reindexing_needed(&self) -> bool { self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors() } @@ -1123,7 +1161,7 @@ impl InnerIndexSettingsDiff { || self.old.proximity_precision != self.new.proximity_precision { Some(DelAddOperation::DeletionAndAddition) - } else if let Some(only_additional_fields) = self.only_additional_fields() { + } else if let Some(only_additional_fields) = &self.only_additional_fields { let additional_field = self.new.fields_ids_map.name(id).unwrap(); if only_additional_fields.contains(additional_field) { Some(DelAddOperation::Addition) @@ -1138,25 +1176,6 @@ impl InnerIndexSettingsDiff { } } - /// Returns only the additional searchable fields. - /// If any other searchable field has been modified, returns None. 
- pub fn only_additional_fields(&self) -> Option> { - match (&self.old.user_defined_searchable_fields, &self.new.user_defined_searchable_fields) { - (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * - (Some(old), Some(new)) => { - let old: HashSet<_> = old.iter().cloned().collect(); - let new: HashSet<_> = new.iter().cloned().collect(); - if old.difference(&new).next().is_none() { - // if no field has been removed - // return only the additional ones - Some(&new - &old) - } else { - None - } - } - } - } - pub fn reindex_facets(&self) -> bool { let existing_fields = &self.new.existing_fields; if existing_fields.iter().any(|field| field.contains('.')) { From 091bb157f1876156caa1322ac8a132cfcbc8a1f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 12:08:27 +0200 Subject: [PATCH 017/110] Add a span for the settings diff creation --- milli/src/update/settings.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 84eccf8e6..eca250c19 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1103,6 +1103,7 @@ pub struct InnerIndexSettingsDiff { } impl InnerIndexSettingsDiff { + #[tracing::instrument(level = "trace", skip_all, target = "indexing::settings")] pub(crate) fn new( old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, From b81953a65db9f634a853d025e4522333956c4bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 12:14:22 +0200 Subject: [PATCH 018/110] Add a span for the prepare_for_documents_reindexing --- milli/src/update/index_documents/transform.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 59bab36e8..c34b7876a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -899,6 +899,11 @@ 
impl<'a, 'i> Transform<'a, 'i> { /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. /// // TODO this can be done in parallel by using the rayon `ThreadPool`. + #[tracing::instrument( + level = "trace" + skip(self, wtxn, settings_diff), + target = "indexing::documents" + )] pub fn prepare_for_documents_reindexing( self, wtxn: &mut heed::RwTxn<'i>, From a998b881f6ea46f6a0d4ad47727c7155ddfc2699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 15:39:05 +0200 Subject: [PATCH 019/110] Cache a lot of operations to know if a field must be indexed --- milli/src/update/settings.rs | 40 +++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index eca250c19..11a249068 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1100,6 +1100,12 @@ pub struct InnerIndexSettingsDiff { /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. pub(crate) only_additional_fields: Option>, + + // Cache the check to see if all the stop_words, allowed_separators, dictionary, + // exact_attributes, proximity_precision are different. + pub(crate) cache_reindex_searchable_without_user_defined: bool, + // Cache the check to see if all the user_defined_searchables are different. 
+ pub(crate) cache_user_defined_searchables: bool, } impl InnerIndexSettingsDiff { @@ -1128,6 +1134,18 @@ impl InnerIndexSettingsDiff { } }; + let cache_reindex_searchable_without_user_defined = { + old_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + != new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) + || old_settings.allowed_separators != new_settings.allowed_separators + || old_settings.dictionary != new_settings.dictionary + || old_settings.exact_attributes != new_settings.exact_attributes + || old_settings.proximity_precision != new_settings.proximity_precision + }; + + let cache_user_defined_searchables = old_settings.user_defined_searchable_fields + != new_settings.user_defined_searchable_fields; + InnerIndexSettingsDiff { old: old_settings, new: new_settings, @@ -1135,6 +1153,8 @@ impl InnerIndexSettingsDiff { embedding_configs_updated, settings_update_only, only_additional_fields, + cache_reindex_searchable_without_user_defined, + cache_user_defined_searchables, } } @@ -1143,24 +1163,11 @@ impl InnerIndexSettingsDiff { } pub fn reindex_searchable(&self) -> bool { - self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - || self.old.allowed_separators != self.new.allowed_separators - || self.old.dictionary != self.new.dictionary - || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields - || self.old.exact_attributes != self.new.exact_attributes - || self.old.proximity_precision != self.new.proximity_precision + self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables } pub fn reindex_searchable_id(&self, id: FieldId) -> Option { - if self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) - || self.old.allowed_separators != self.new.allowed_separators - || self.old.dictionary != self.new.dictionary - || 
self.old.exact_attributes != self.new.exact_attributes - // Here we can be much more optimal by just deleting the proximity database - || self.old.proximity_precision != self.new.proximity_precision - { + if self.cache_reindex_searchable_without_user_defined { Some(DelAddOperation::DeletionAndAddition) } else if let Some(only_additional_fields) = &self.only_additional_fields { let additional_field = self.new.fields_ids_map.name(id).unwrap(); @@ -1169,8 +1176,7 @@ impl InnerIndexSettingsDiff { } else { None } - } else if self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields - { + } else if self.cache_user_defined_searchables { Some(DelAddOperation::DeletionAndAddition) } else { None From 2af7e4dbe9f7e3af3aa678ad21969b25fb91aac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 30 May 2024 16:46:57 +0200 Subject: [PATCH 020/110] Rename the embeddings workloads --- ...subset-hf-embeddings.json => embeddings-movies-subset-hf.json} | 0 ...{settings-add-embeddings.json => embeddings-settings-add.json} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename workloads/{movies-subset-hf-embeddings.json => embeddings-movies-subset-hf.json} (100%) rename workloads/{settings-add-embeddings.json => embeddings-settings-add.json} (100%) diff --git a/workloads/movies-subset-hf-embeddings.json b/workloads/embeddings-movies-subset-hf.json similarity index 100% rename from workloads/movies-subset-hf-embeddings.json rename to workloads/embeddings-movies-subset-hf.json diff --git a/workloads/settings-add-embeddings.json b/workloads/embeddings-settings-add.json similarity index 100% rename from workloads/settings-add-embeddings.json rename to workloads/embeddings-settings-add.json From 5cd08979b189ef8e41c3da1e2e56eb2a45c487eb Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 3 Jun 2024 11:42:57 +0200 Subject: [PATCH 021/110] iterate over the faceted fields instead of over the whole document --- 
.../extract/extract_fid_docid_facet_values.rs | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 3cbd7e49e..bcbb87a58 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, BTreeSet}; use std::convert::TryInto; use std::fs::File; use std::io::{self, BufReader}; @@ -9,7 +9,7 @@ use std::result::Result as StdResult; use bytemuck::bytes_of; use grenad::Sorter; use heed::BytesEncode; -use itertools::EitherOrBoth; +use itertools::{merge_join_by, EitherOrBoth}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::{from_slice, Value}; @@ -18,7 +18,7 @@ use FilterableValues::{Empty, Null, Values}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::del_add::{DelAdd, KvWriterDelAdd}; +use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::update::settings::InnerIndexSettingsDiff; use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH}; @@ -66,6 +66,11 @@ pub fn extract_fid_docid_facet_values( max_memory.map(|m| m / 2), ); + let old_faceted_fids: BTreeSet<_> = + settings_diff.old.faceted_fields_ids.iter().copied().collect(); + let new_faceted_fids: BTreeSet<_> = + settings_diff.new.faceted_fields_ids.iter().copied().collect(); + // The tuples represents the Del and Add side for a bitmap let mut facet_exists_docids = BTreeMap::::new(); let mut facet_is_null_docids = 
BTreeMap::::new(); @@ -78,11 +83,45 @@ pub fn extract_fid_docid_facet_values( let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::new(value); + let get_document_json_value = move |field_id, side| { + obkv.get(field_id) + .map(KvReaderDelAdd::new) + .and_then(|kv| kv.get(side)) + .map(from_slice) + .transpose() + .map_err(InternalError::SerdeJson) + }; + // iterate over the faceted fields instead of over the whole document. + for eob in + merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| old.cmp(new)) + { + let (field_id, del_value, add_value) = match eob { + EitherOrBoth::Left(&field_id) => { + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; - for (field_id, field_bytes) in obkv.iter() { - let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id); - let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id); - if delete_faceted || add_faceted { + // deletion only + (field_id, del_value, None) + } + EitherOrBoth::Right(&field_id) => { + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + // addition only + (field_id, None, add_value) + } + EitherOrBoth::Both(&field_id, _) => { + // during settings update, recompute the changing settings only. 
+ if settings_diff.settings_update_only { + continue; + } + + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + (field_id, del_value, add_value) + } + }; + + if del_value.is_some() || add_value.is_some() { numbers_key_buffer.clear(); strings_key_buffer.clear(); @@ -98,17 +137,6 @@ pub fn extract_fid_docid_facet_values( numbers_key_buffer.extend_from_slice(docid_bytes); strings_key_buffer.extend_from_slice(docid_bytes); - let del_add_obkv = obkv::KvReader::new(field_bytes); - let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted) - { - Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), - None => None, - }; - let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) { - Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), - None => None, - }; - // We insert the document id on the Del and the Add side if the field exists. 
let (ref mut del_exists, ref mut add_exists) = facet_exists_docids.entry(field_id).or_default(); From 261e92d7e6511ea212c5440fc85522e9f40fbfb7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 4 Jun 2024 15:31:24 +0200 Subject: [PATCH 022/110] Skip iterating over documents when the faceted field list doesn't change --- .../extract/extract_fid_docid_facet_values.rs | 318 +++++++++--------- 1 file changed, 161 insertions(+), 157 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index bcbb87a58..810fa26a9 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -66,11 +66,6 @@ pub fn extract_fid_docid_facet_values( max_memory.map(|m| m / 2), ); - let old_faceted_fids: BTreeSet<_> = - settings_diff.old.faceted_fields_ids.iter().copied().collect(); - let new_faceted_fids: BTreeSet<_> = - settings_diff.new.faceted_fields_ids.iter().copied().collect(); - // The tuples represents the Del and Add side for a bitmap let mut facet_exists_docids = BTreeMap::::new(); let mut facet_is_null_docids = BTreeMap::::new(); @@ -80,172 +75,181 @@ pub fn extract_fid_docid_facet_values( let mut numbers_key_buffer = Vec::new(); let mut strings_key_buffer = Vec::new(); - let mut cursor = obkv_documents.into_cursor()?; - while let Some((docid_bytes, value)) = cursor.move_on_next()? { - let obkv = obkv::KvReader::new(value); - let get_document_json_value = move |field_id, side| { - obkv.get(field_id) - .map(KvReaderDelAdd::new) - .and_then(|kv| kv.get(side)) - .map(from_slice) - .transpose() - .map_err(InternalError::SerdeJson) - }; - // iterate over the faceted fields instead of over the whole document. 
- for eob in - merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| old.cmp(new)) - { - let (field_id, del_value, add_value) = match eob { - EitherOrBoth::Left(&field_id) => { - let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; + let old_faceted_fids: BTreeSet<_> = + settings_diff.old.faceted_fields_ids.iter().copied().collect(); + let new_faceted_fids: BTreeSet<_> = + settings_diff.new.faceted_fields_ids.iter().copied().collect(); - // deletion only - (field_id, del_value, None) - } - EitherOrBoth::Right(&field_id) => { - let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { + let mut cursor = obkv_documents.into_cursor()?; + while let Some((docid_bytes, value)) = cursor.move_on_next()? { + let obkv = obkv::KvReader::new(value); + let get_document_json_value = move |field_id, side| { + obkv.get(field_id) + .map(KvReaderDelAdd::new) + .and_then(|kv| kv.get(side)) + .map(from_slice) + .transpose() + .map_err(InternalError::SerdeJson) + }; + // iterate over the faceted fields instead of over the whole document. + for eob in + merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| { + old.cmp(new) + }) + { + let (field_id, del_value, add_value) = match eob { + EitherOrBoth::Left(&field_id) => { + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; - // addition only - (field_id, None, add_value) - } - EitherOrBoth::Both(&field_id, _) => { - // during settings update, recompute the changing settings only. - if settings_diff.settings_update_only { - continue; + // deletion only + (field_id, del_value, None) + } + EitherOrBoth::Right(&field_id) => { + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + // addition only + (field_id, None, add_value) + } + EitherOrBoth::Both(&field_id, _) => { + // during settings update, recompute the changing settings only. 
+ if settings_diff.settings_update_only { + continue; + } + + let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; + let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + + (field_id, del_value, add_value) + } + }; + + if del_value.is_some() || add_value.is_some() { + numbers_key_buffer.clear(); + strings_key_buffer.clear(); + + // Set key to the field_id + // Note: this encoding is consistent with FieldIdCodec + numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); + strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); + + let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); + let document = DocumentId::from_be_bytes(document); + + // For the other extraction tasks, prefix the key with the field_id and the document_id + numbers_key_buffer.extend_from_slice(docid_bytes); + strings_key_buffer.extend_from_slice(docid_bytes); + + // We insert the document id on the Del and the Add side if the field exists. + let (ref mut del_exists, ref mut add_exists) = + facet_exists_docids.entry(field_id).or_default(); + let (ref mut del_is_null, ref mut add_is_null) = + facet_is_null_docids.entry(field_id).or_default(); + let (ref mut del_is_empty, ref mut add_is_empty) = + facet_is_empty_docids.entry(field_id).or_default(); + + if del_value.is_some() { + del_exists.insert(document); + } + if add_value.is_some() { + add_exists.insert(document); } - let del_value = get_document_json_value(field_id, DelAdd::Deletion)?; - let add_value = get_document_json_value(field_id, DelAdd::Addition)?; + let del_geo_support = settings_diff + .old + .geo_fields_ids + .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + let add_geo_support = settings_diff + .new + .geo_fields_ids + .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + let del_filterable_values = + del_value.map(|value| extract_facet_values(&value, del_geo_support)); + let add_filterable_values = + add_value.map(|value| 
extract_facet_values(&value, add_geo_support)); - (field_id, del_value, add_value) - } - }; + // Those closures are just here to simplify things a bit. + let mut insert_numbers_diff = |del_numbers, add_numbers| { + insert_numbers_diff( + &mut fid_docid_facet_numbers_sorter, + &mut numbers_key_buffer, + del_numbers, + add_numbers, + ) + }; + let mut insert_strings_diff = |del_strings, add_strings| { + insert_strings_diff( + &mut fid_docid_facet_strings_sorter, + &mut strings_key_buffer, + del_strings, + add_strings, + ) + }; - if del_value.is_some() || add_value.is_some() { - numbers_key_buffer.clear(); - strings_key_buffer.clear(); - - // Set key to the field_id - // Note: this encoding is consistent with FieldIdCodec - numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); - strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); - - let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); - let document = DocumentId::from_be_bytes(document); - - // For the other extraction tasks, prefix the key with the field_id and the document_id - numbers_key_buffer.extend_from_slice(docid_bytes); - strings_key_buffer.extend_from_slice(docid_bytes); - - // We insert the document id on the Del and the Add side if the field exists. 
- let (ref mut del_exists, ref mut add_exists) = - facet_exists_docids.entry(field_id).or_default(); - let (ref mut del_is_null, ref mut add_is_null) = - facet_is_null_docids.entry(field_id).or_default(); - let (ref mut del_is_empty, ref mut add_is_empty) = - facet_is_empty_docids.entry(field_id).or_default(); - - if del_value.is_some() { - del_exists.insert(document); - } - if add_value.is_some() { - add_exists.insert(document); - } - - let del_geo_support = settings_diff - .old - .geo_fields_ids - .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); - let add_geo_support = settings_diff - .new - .geo_fields_ids - .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); - let del_filterable_values = - del_value.map(|value| extract_facet_values(&value, del_geo_support)); - let add_filterable_values = - add_value.map(|value| extract_facet_values(&value, add_geo_support)); - - // Those closures are just here to simplify things a bit. - let mut insert_numbers_diff = |del_numbers, add_numbers| { - insert_numbers_diff( - &mut fid_docid_facet_numbers_sorter, - &mut numbers_key_buffer, - del_numbers, - add_numbers, - ) - }; - let mut insert_strings_diff = |del_strings, add_strings| { - insert_strings_diff( - &mut fid_docid_facet_strings_sorter, - &mut strings_key_buffer, - del_strings, - add_strings, - ) - }; - - match (del_filterable_values, add_filterable_values) { - (None, None) => (), - (Some(del_filterable_values), None) => match del_filterable_values { - Null => { - del_is_null.insert(document); - } - Empty => { - del_is_empty.insert(document); - } - Values { numbers, strings } => { - insert_numbers_diff(numbers, vec![])?; - insert_strings_diff(strings, vec![])?; - } - }, - (None, Some(add_filterable_values)) => match add_filterable_values { - Null => { - add_is_null.insert(document); - } - Empty => { - add_is_empty.insert(document); - } - Values { numbers, strings } => { - insert_numbers_diff(vec![], numbers)?; - insert_strings_diff(vec![], 
strings)?; - } - }, - (Some(del_filterable_values), Some(add_filterable_values)) => { - match (del_filterable_values, add_filterable_values) { - (Null, Null) | (Empty, Empty) => (), - (Null, Empty) => { - del_is_null.insert(document); - add_is_empty.insert(document); - } - (Empty, Null) => { - del_is_empty.insert(document); - add_is_null.insert(document); - } - (Null, Values { numbers, strings }) => { - insert_numbers_diff(vec![], numbers)?; - insert_strings_diff(vec![], strings)?; + match (del_filterable_values, add_filterable_values) { + (None, None) => (), + (Some(del_filterable_values), None) => match del_filterable_values { + Null => { del_is_null.insert(document); } - (Empty, Values { numbers, strings }) => { - insert_numbers_diff(vec![], numbers)?; - insert_strings_diff(vec![], strings)?; + Empty => { del_is_empty.insert(document); } - (Values { numbers, strings }, Null) => { - add_is_null.insert(document); + Values { numbers, strings } => { insert_numbers_diff(numbers, vec![])?; insert_strings_diff(strings, vec![])?; } - (Values { numbers, strings }, Empty) => { - add_is_empty.insert(document); - insert_numbers_diff(numbers, vec![])?; - insert_strings_diff(strings, vec![])?; + }, + (None, Some(add_filterable_values)) => match add_filterable_values { + Null => { + add_is_null.insert(document); } - ( - Values { numbers: del_numbers, strings: del_strings }, - Values { numbers: add_numbers, strings: add_strings }, - ) => { - insert_numbers_diff(del_numbers, add_numbers)?; - insert_strings_diff(del_strings, add_strings)?; + Empty => { + add_is_empty.insert(document); + } + Values { numbers, strings } => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + } + }, + (Some(del_filterable_values), Some(add_filterable_values)) => { + match (del_filterable_values, add_filterable_values) { + (Null, Null) | (Empty, Empty) => (), + (Null, Empty) => { + del_is_null.insert(document); + add_is_empty.insert(document); + } + (Empty, Null) => { 
+ del_is_empty.insert(document); + add_is_null.insert(document); + } + (Null, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + del_is_null.insert(document); + } + (Empty, Values { numbers, strings }) => { + insert_numbers_diff(vec![], numbers)?; + insert_strings_diff(vec![], strings)?; + del_is_empty.insert(document); + } + (Values { numbers, strings }, Null) => { + add_is_null.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + (Values { numbers, strings }, Empty) => { + add_is_empty.insert(document); + insert_numbers_diff(numbers, vec![])?; + insert_strings_diff(strings, vec![])?; + } + ( + Values { numbers: del_numbers, strings: del_strings }, + Values { numbers: add_numbers, strings: add_strings }, + ) => { + insert_numbers_diff(del_numbers, add_numbers)?; + insert_strings_diff(del_strings, add_strings)?; + } } } } From 0a4118329eafc4fca2055b368f85b5a4b35054f3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 5 Jun 2024 10:51:16 +0200 Subject: [PATCH 023/110] Put only_additional_fields to None if the difference gives an empty result. 
--- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 11a249068..952b017c6 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1127,7 +1127,7 @@ impl InnerIndexSettingsDiff { let new: HashSet<_> = new.iter().cloned().collect(); if old.difference(&new).next().is_none() { // if no field has been removed return only the additional ones - Some(&new - &old) + Some(&new - &old).filter(|x| !x.is_empty()) } else { None } From b833be46b9b1076d3dc2b26d7a1e197e55e887d7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 5 Jun 2024 12:48:44 +0200 Subject: [PATCH 024/110] Avoid running proximity when only the exact attributes changes --- .../extract_word_pair_proximity_docids.rs | 9 ++++---- milli/src/update/settings.rs | 22 +++++++++++++++---- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 617338f9f..5a9363942 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -26,11 +26,8 @@ pub fn extract_word_pair_proximity_docids( indexer: GrenadParameters, settings_diff: &InnerIndexSettingsDiff, ) -> Result>> { - let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord; - let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord; - // early return if the data shouldn't be deleted nor created. 
- if !any_deletion && !any_addition { + if settings_diff.settings_update_only && !settings_diff.reindex_proximities() { let writer = create_writer( indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -39,8 +36,10 @@ pub fn extract_word_pair_proximity_docids( return writer_into_reader(writer); } - let max_memory = indexer.max_memory_by_thread(); + let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord; + let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord; + let max_memory = indexer.max_memory_by_thread(); let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) .map(|_| { create_sorter( diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 952b017c6..dc26ac746 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1104,8 +1104,10 @@ pub struct InnerIndexSettingsDiff { // Cache the check to see if all the stop_words, allowed_separators, dictionary, // exact_attributes, proximity_precision are different. pub(crate) cache_reindex_searchable_without_user_defined: bool, - // Cache the check to see if all the user_defined_searchables are different. + // Cache the check to see if the user_defined_searchables are different. pub(crate) cache_user_defined_searchables: bool, + // Cache the check to see if the exact_attributes are different. 
+ pub(crate) cache_exact_attributes: bool, } impl InnerIndexSettingsDiff { @@ -1139,10 +1141,11 @@ impl InnerIndexSettingsDiff { != new_settings.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) || old_settings.allowed_separators != new_settings.allowed_separators || old_settings.dictionary != new_settings.dictionary - || old_settings.exact_attributes != new_settings.exact_attributes || old_settings.proximity_precision != new_settings.proximity_precision }; + let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; + let cache_user_defined_searchables = old_settings.user_defined_searchable_fields != new_settings.user_defined_searchable_fields; @@ -1155,6 +1158,7 @@ impl InnerIndexSettingsDiff { only_additional_fields, cache_reindex_searchable_without_user_defined, cache_user_defined_searchables, + cache_exact_attributes, } } @@ -1163,11 +1167,21 @@ impl InnerIndexSettingsDiff { } pub fn reindex_searchable(&self) -> bool { - self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables + self.cache_reindex_searchable_without_user_defined + || self.cache_exact_attributes + || self.cache_user_defined_searchables + } + + pub fn reindex_proximities(&self) -> bool { + // if any searchable settings force the reindexing + (self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables) + // and if any settings needs the proximity database created + && (self.old.proximity_precision == ProximityPrecision::ByAttribute + || self.old.proximity_precision == ProximityPrecision::ByAttribute) } pub fn reindex_searchable_id(&self, id: FieldId) -> Option { - if self.cache_reindex_searchable_without_user_defined { + if self.cache_reindex_searchable_without_user_defined || self.cache_exact_attributes { Some(DelAddOperation::DeletionAndAddition) } else if let Some(only_additional_fields) = &self.only_additional_fields { let additional_field = self.new.fields_ids_map.name(id).unwrap(); From 
30293883e03670a0258d35b71fa2be508e5ee985 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 5 Jun 2024 16:00:24 +0200 Subject: [PATCH 025/110] Fix condition mistake --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index dc26ac746..be9b6b74e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1177,7 +1177,7 @@ impl InnerIndexSettingsDiff { (self.cache_reindex_searchable_without_user_defined || self.cache_user_defined_searchables) // and if any settings needs the proximity database created && (self.old.proximity_precision == ProximityPrecision::ByAttribute - || self.old.proximity_precision == ProximityPrecision::ByAttribute) + || self.new.proximity_precision == ProximityPrecision::ByAttribute) } pub fn reindex_searchable_id(&self, id: FieldId) -> Option { From 2e50c6ec81b05cf3e20c5457c64e37f8579eccd5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Jun 2024 10:18:43 +0200 Subject: [PATCH 026/110] Update Charabia --- Cargo.lock | 248 ++++++++++++++++++++++------------------------- milli/Cargo.toml | 2 +- 2 files changed, 118 insertions(+), 132 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 96c119a19..b62a61f92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -898,9 +898,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.8.10" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4" +checksum = "11a09ae38cfcc153f01576c3f579dfd916e0320f1b474f298c8d680b2dd92eb6" dependencies = [ "aho-corasick", "cow-utils", @@ -989,7 +989,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim", + "strsim 0.10.0", ] [[package]] @@ -1280,12 +1280,12 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.3" +version = "0.20.9" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "0209d94da627ab5605dcccf08bb18afa5009cfbef48d8a8b7d7bdbc79be25c5e" +checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1" dependencies = [ - "darling_core 0.20.3", - "darling_macro 0.20.3", + "darling_core 0.20.9", + "darling_macro 0.20.9", ] [[package]] @@ -1298,21 +1298,21 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn 1.0.109", ] [[package]] name = "darling_core" -version = "0.20.3" +version = "0.20.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "177e3443818124b357d8e76f53be906d60937f0d3a90773a664fa63fa253e621" +checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.11.1", "syn 2.0.60", ] @@ -1329,11 +1329,11 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.3" +version = "0.20.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" +checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" dependencies = [ - "darling_core 0.20.3", + "darling_core 0.20.9", "quote", "syn 2.0.60", ] @@ -1386,6 +1386,15 @@ dependencies = [ "derive_builder_macro 0.13.1", ] +[[package]] +name = "derive_builder" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +dependencies = [ + "derive_builder_macro 0.20.0", +] + [[package]] name = "derive_builder_core" version = "0.12.0" @@ -1410,6 +1419,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder_core" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +dependencies = [ + "darling 0.20.9", + "proc-macro2", + "quote", + "syn 2.0.60", +] + 
[[package]] name = "derive_builder_macro" version = "0.12.0" @@ -1430,6 +1451,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_builder_macro" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +dependencies = [ + "derive_builder_core 0.20.0", + "syn 2.0.60", +] + [[package]] name = "derive_more" version = "0.99.17" @@ -1457,7 +1488,7 @@ dependencies = [ "serde-cs", "serde_json", "serde_urlencoded", - "strsim", + "strsim 0.10.0", ] [[package]] @@ -1710,29 +1741,6 @@ dependencies = [ "syn 2.0.60", ] -[[package]] -name = "env_filter" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a009aa4810eb158359dda09d0c87378e4bbb89b5a801f016885a4707ba24f7ea" -dependencies = [ - "log", - "regex", -] - -[[package]] -name = "env_logger" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" -dependencies = [ - "anstream", - "anstyle", - "env_filter", - "humantime", - "log", -] - [[package]] name = "equivalent" version = "1.0.1" @@ -1787,7 +1795,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d15473d7f83b54a44826907af16ae5727eaacaf6e53b51474016d3efd9aa35d5" dependencies = [ - "darling 0.20.3", + "darling 0.20.9", "proc-macro2", "quote", "syn 2.0.60", @@ -2382,12 +2390,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - [[package]] name = "hyper" version = "0.14.27" @@ -2781,9 +2783,9 @@ dependencies = [ [[package]] name = "lindera" 
-version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1bbf252ea3490053dc397539ece0b510924f2f72605fa28d3e858d86f43ec88" +checksum = "dcd4fa369654517f72c10b24adf03ad4ce69d19facb79c3cb3cf9b4580ac352f" dependencies = [ "lindera-analyzer", "lindera-core", @@ -2794,9 +2796,9 @@ dependencies = [ [[package]] name = "lindera-analyzer" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87febfec0e2859ce2154fb90dd6f66b774ddb0b6e264b44f8e3d1303c9dcedd7" +checksum = "c2cba7fe275cb8ec4c594cfee9cc39e48b71e02a089457d52f3e70dc146a8133" dependencies = [ "anyhow", "bincode", @@ -2824,9 +2826,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb91bb8a93ab0f95dbc3c43b5105354bb059134ef731154f75a64b5d919e71d" +checksum = "240adf9faba3f09ad16557aefcd316dd00ebb940ac94334a629660d772f118c1" dependencies = [ "bincode", "byteorder", @@ -2838,29 +2840,21 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6022a8309a287dbef425fd09a61585351670c83001d74f6c089979e2330b683" +checksum = "f12241f9e74babe708a0b9441d9f3fa67cb29fd01257918f30ffd480ca568820" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-compress" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32363cbcf433f915e7d77c2a0c410db2d6b23442e80715cf2cf6b9864078a500" +checksum = "50f9f7a858d70ff9e4383cbd507ca9e98c8faf0319e08c10df4c30cb58c9ca6c" dependencies = [ "anyhow", "flate2", @@ -2869,9 
+2863,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9a0e858753a02b1a3524fae4fbb11ca4b3a947128fd7854b797386562678be8" +checksum = "7f09810ab98ce2a084d788ac38fbb7b31697f34bc47c61de0d880320a674bd15" dependencies = [ "anyhow", "bincode", @@ -2886,9 +2880,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e406345f6f8b665b9a129c67079c18ca9d97e9d171d102b4106a64a592c285e" +checksum = "d53400c9b2dd6b45f82d9fa5b5efe079f3acaf6ce609dba8d42c8a76baaa2b12" dependencies = [ "anyhow", "flate2", @@ -2897,9 +2891,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e2a3ec0e5fd6768a27c6ec1040e8470d3a5926418f7afe065859e98aabb3bfe" +checksum = "2053d064a515839250438b8dfa6cf445e2b97633232ded34a54f267e945d196e" dependencies = [ "anyhow", "bincode", @@ -2921,10 +2915,32 @@ dependencies = [ ] [[package]] -name = "lindera-filter" -version = "0.30.0" +name = "lindera-dictionary-builder" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1badaf51bad051185ea4917ba91bbbf2d6f8167e155647e21e0eaaef0982a95d" +checksum = "14f486924055f8bedcc5877572e4dc91fbc10370862430ac2e5f7f0d671a18c8" +dependencies = [ + "anyhow", + "bincode", + "byteorder", + "csv", + "derive_builder 0.20.0", + "encoding", + "encoding_rs", + "encoding_rs_io", + "glob", + "lindera-compress", + "lindera-core", + "lindera-decompress", + "log", + "yada", +] + +[[package]] +name = "lindera-filter" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb3904fc279f0297f6fd6210435adab1f8c82ba84eba8635407c791af51c0d8a" dependencies = [ "anyhow", "csv", @@ -2947,9 
+2963,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "129ec16366354998f9791467ad38731539197747f649e573ead845358271ce25" +checksum = "4aa3ef2f1f6838b0fa2e2fca2896242bb83bc877c1760cdb6fa23449ab95d664" dependencies = [ "bincode", "byteorder", @@ -2961,31 +2977,21 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f0979a56bc57e9c9be2996dff232c47aa146a2e7baebf5dd567e388eba3dd90" +checksum = "a41287db18eadb58d73a04d49778d41c161549fbbbe155d4338976b7b8541c7d" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "serde", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-ipadic-neologd" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20076660c4e79ef0316735b44e18ec7644e54786acdee8946c972d5f97086d0f" +checksum = "49382256f245078400bf7e72663f9eb30afcd9ed54cd46f29d7db1be529678e1" dependencies = [ "bincode", "byteorder", @@ -2997,31 +3003,21 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eccd18ed5f65d1d64ac0cbfa1d6827bfbbaf6530520ae6847e6a91ee38f47e20" +checksum = "5ae9cfd2fda68ef526ef0c7b50c5d4d5582a4daa6ecd0cea9e2b0b62564a2a5d" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "serde", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-ko-dic" -version = "0.30.0" +version = "0.31.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59073171566c3e498ca048e84c2d0a7e117a42f36c8eb7d7163e65ac38bd6d48" +checksum = "7f86d03a863f3ae1d269e7b7d4dd2cce9385a53463479bafc5d7aa48719f36db" dependencies = [ "bincode", "byteorder", @@ -3037,29 +3033,21 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae176afa8535ca2a5ee9471873f85d531db0a6c32a3c42b41084506aac22b577" +checksum = "bd0f44f2e56358c5879dfb5e7f76cc6ba7853ec31082c4e3f8fb65fb2d849c51" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "yada", + "lindera-dictionary-builder", ] [[package]] name = "lindera-tokenizer" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "457285bdde84571aa510c9e05371904305a55e8a541fa1473d4393062f06932d" +checksum = "7c5182735cdc2832ac757b31e8a5b150a3514357a30efe3dec212f8dcb06ba14" dependencies = [ "bincode", "lindera-core", @@ -3071,9 +3059,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5839980be552dfa639b70964c61914a9ad014148663679b0e148aa72e5e30f23" +checksum = "6c63da104728dd1cf14bfa564753cbfa996f6078ed2e23e31475bd1d639fc597" dependencies = [ "bincode", "byteorder", @@ -3089,22 +3077,14 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcaab8f061d5b944b1e424f49c7efbf8f276e8a72e4f4ff956d01e46d481f008" +checksum = "04acecbc068dac21766a1b7ed1f2608b6f250d10b4f8bff67abc2a00437a0974" dependencies = [ "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - 
"glob", - "lindera-compress", "lindera-core", "lindera-decompress", - "log", - "yada", + "lindera-dictionary-builder", ] [[package]] @@ -4909,6 +4889,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 4a08e6261..f23694d10 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ bincode = "1.3.3" bstr = "1.9.0" bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.8.10", default-features = false } +charabia = { version = "0.8.11", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.11" deserr = "0.6.1" From 4148fbbe8557dd3fa1d5a6d67d14665eca816e4c Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 21 May 2024 11:47:05 +0200 Subject: [PATCH 027/110] provide a method to get all the nested fields ids from a name --- milli/src/fields_ids_map.rs | 38 +++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 9c1c87f82..f9d7c3704 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -41,6 +41,16 @@ impl FieldsIdsMap { } } + /// Get the ids of a field and all its nested fields based on its name. + pub fn nested_ids(&self, name: &str) -> Vec { + self.names_ids + .range(name.to_string()..) + .take_while(|(key, _)| key.starts_with(name)) + .filter(|(key, _)| crate::is_faceted_by(key, name)) + .map(|(_name, id)| *id) + .collect() + } + /// Get the id of a field based on its name. 
pub fn id(&self, name: &str) -> Option { self.names_ids.get(name).copied() @@ -126,4 +136,32 @@ mod tests { assert_eq!(iter.next(), Some((3, "title"))); assert_eq!(iter.next(), None); } + + #[test] + fn nested_fields() { + let mut map = FieldsIdsMap::new(); + + assert_eq!(map.insert("id"), Some(0)); + assert_eq!(map.insert("doggo"), Some(1)); + assert_eq!(map.insert("doggo.name"), Some(2)); + assert_eq!(map.insert("doggolution"), Some(3)); + assert_eq!(map.insert("doggo.breed.name"), Some(4)); + assert_eq!(map.insert("description"), Some(5)); + + insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###" + [ + 1, + 4, + 2, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###" + [ + 4, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]"); + } } From 7a84697570c4f03f903328d0da7145941a1ef445 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 21 May 2024 17:08:45 +0200 Subject: [PATCH 028/110] never store the _vectors as searchable or faceted fields --- milli/src/fieldids_weights_map.rs | 10 ++- milli/src/index.rs | 101 +++++++++++++++++++++++++++++- milli/src/update/settings.rs | 22 +++++-- 3 files changed, 124 insertions(+), 9 deletions(-) diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index a737632a4..2bf828711 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{FieldId, FieldsIdsMap, Weight}; +use crate::{vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME, FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] pub struct FieldidsWeightsMap { @@ -23,7 +23,13 @@ impl FieldidsWeightsMap { /// Should only be called in the case there are NO searchable attributes. /// All the fields will be inserted in the order of the fields ids map with a weight of 0. 
pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { - FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } + FieldidsWeightsMap { + map: fid_map + .iter() + .filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) + .map(|(fid, _name)| (fid, 0)) + .collect(), + } } /// Removes a field id from the map, returning the associated weight previously in the map. diff --git a/milli/src/index.rs b/milli/src/index.rs index 3c502d541..ef4936ed1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,7 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, @@ -644,6 +645,7 @@ impl Index { &self, wtxn: &mut RwTxn, user_fields: &[&str], + non_searchable_fields_ids: &[FieldId], fields_ids_map: &FieldsIdsMap, ) -> Result<()> { // We can write the user defined searchable fields as-is. @@ -662,6 +664,7 @@ impl Index { for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) && !real_fields.contains(&field_from_map) + && !non_searchable_fields_ids.contains(&id) { real_fields.push(field_from_map); @@ -708,6 +711,7 @@ impl Index { Ok(self .fields_ids_map(rtxn)? 
.names() + .filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) .map(|field| Cow::Owned(field.to_string())) .collect()) }) @@ -1669,15 +1673,17 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; - use maplit::hashset; + use maplit::{btreemap, hashset}; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, + Settings, }; + use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -2783,4 +2789,95 @@ pub(crate) mod tests { ] "###); } + + #[test] + fn vectors_are_never_indexed_as_searchable_or_filterable() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "_vectors": { "doggo": [2345] } }, + { "id": 1, "_vectors": { "doggo": [6789] } }, + ])) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @r###"["id"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + drop(rtxn); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + 
db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + + index + .update_settings(|settings| { + settings.set_embedder_settings(btreemap! { + S("doggo") => Setting::Set(EmbeddingSettings { + dimensions: Setting::Set(1), + source: Setting::Set(EmbedderSource::UserProvided), + ..EmbeddingSettings::default()}), + }); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index be9b6b74e..68c31fabb 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -19,6 +19,7 @@ use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; @@ 
-490,6 +491,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, &names, + &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), &fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -1252,6 +1254,8 @@ pub(crate) struct InnerIndexSettings { pub embedding_configs: EmbeddingConfigs, pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, + pub non_searchable_fields_ids: Vec, + pub non_faceted_fields_ids: Vec, } impl InnerIndexSettings { @@ -1265,8 +1269,8 @@ impl InnerIndexSettings { let user_defined_searchable_fields = user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; + let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; + let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; @@ -1294,6 +1298,10 @@ impl InnerIndexSettings { None => None, }; + let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); + searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); + faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + Ok(Self { stop_words, allowed_separators, @@ -1308,6 +1316,8 @@ impl InnerIndexSettings { embedding_configs, existing_fields, geo_fields_ids, + non_searchable_fields_ids: vectors_fids.clone(), + non_faceted_fields_ids: vectors_fids.clone(), }) } @@ -1315,9 +1325,10 @@ impl InnerIndexSettings { pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { let new_facets = self .fields_ids_map - .names() - .filter(|&field| 
crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|field| field.to_string()) + .iter() + .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) + .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) + .map(|(_fid, field)| field.to_string()) .collect(); index.put_faceted_fields(wtxn, &new_facets)?; @@ -1337,6 +1348,7 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, + &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } From 84e498299bcc492ab91ecfefb989fdbd8ef897d8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 May 2024 15:27:09 +0200 Subject: [PATCH 029/110] Remove the vectors from the documents database --- Cargo.lock | 2 + index-scheduler/Cargo.toml | 2 + index-scheduler/src/lib.rs | 287 +++++++++++++++++- .../documents after initial push.snap | 4 + .../documents after setting an embedder.snap | 4 + meilisearch-types/src/settings.rs | 2 +- milli/Cargo.toml | 2 +- milli/src/index.rs | 18 +- .../extract/extract_vector_points.rs | 46 ++- .../src/update/index_documents/extract/mod.rs | 4 + milli/src/update/index_documents/mod.rs | 9 +- .../src/update/index_documents/typed_chunk.rs | 20 +- milli/src/update/settings.rs | 40 ++- milli/src/vector/parsed_vectors.rs | 18 +- 14 files changed, 407 insertions(+), 51 deletions(-) create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap diff --git a/Cargo.lock b/Cargo.lock index b62a61f92..3b28a00e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2455,6 +2455,7 @@ name = "index-scheduler" version = "1.9.0" dependencies = [ "anyhow", + "arroy", "big_s", "bincode", "crossbeam", @@ -2465,6 +2466,7 @@ dependencies = [ "file-store", "flate2", "insta", + "maplit", "meili-snap", "meilisearch-auth", 
"meilisearch-types", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 21fa34733..8959bb070 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,9 @@ ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] +arroy = "0.3.1" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.34.0", features = ["json", "redactions"] } +maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 8a1c2f540..ebeac30b3 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1459,11 +1459,11 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>, + embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>, ) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| { + .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| { let prompt = Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); // optimistically return existing embedder @@ -1748,6 +1748,9 @@ mod tests { use meilisearch_types::milli::update::IndexDocumentsMethod::{ ReplaceDocuments, UpdateDocuments, }; + use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::vector::settings::EmbeddingSettings; + use meilisearch_types::settings::{Checked, Unchecked}; use meilisearch_types::tasks::IndexSwap; use meilisearch_types::VERSION_FILE_NAME; use tempfile::{NamedTempFile, TempDir}; @@ -3052,7 +3055,9 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (_, embedding_config) = configs.first().unwrap(); + let (name, embedding_config, user_provided) = configs.first().unwrap(); + 
insta::assert_snapshot!(name, @"default"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(embedding_config.embedder_options); } @@ -5017,13 +5022,15 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config) = configs.get(0).unwrap(); - insta::assert_json_snapshot!(name, @r###""A_fakerest""###); + let (name, fakerest_config, user_provided) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config) = configs.get(1).unwrap(); - insta::assert_json_snapshot!(name, @r###""B_small_hf""###); + let (name, simple_hf_config, user_provided) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5091,6 +5098,18 @@ let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the documents have been inserted into the relevant bitmap + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, _config, user_defined) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let (name, _config, user_defined) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); @@ -5153,6 +5172,18 @@ mod tests
{ let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the documents have been inserted into the relevant bitmap + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, _config, user_defined) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let (name, _config, user_defined) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou @@ -5176,4 +5207,246 @@ mod tests { } } } + + #[test] + fn import_vectors_first_and_embedder_later() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "my_doggo_embedder": vec![1; 384], + "unknown embedder": vec![1, 2, 3], + } + }, + { + "id": 2, + "doggo": "max", + "_vectors": { + "my_doggo_embedder": { + "userProvided": true, + "embeddings": vec![2; 384], + }, + "unknown embedder": vec![4, 5], + }, + }, + { + "id": 3, + "doggo": "marcel", + "_vectors": { + "my_doggo_embedder": { + "userProvided": false, + "embeddings": vec![3; 384], + }, + }, + }, + { + "id": 4, + "doggo": "sora", + "_vectors": { + "my_doggo_embedder": { + "userProvided": false, + }, + }, + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0 as u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"5"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method:
ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); + + let mut setting = meilisearch_types::settings::Settings::::default(); + setting.embedders = Setting::Set(maplit::btreemap! { + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + .. 
EmbeddingSettings::default() + }) + }); + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + // all the vectors linked to the newly specified embedder have been removed + // Only the unknown embedders stay in the document DB + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // even though we specified the vector for the ID 3, it shouldn't be marked + // as user provided since we explicitly marked it as NOT user provided.
+ snapshot!(format!("{conf:#?}"), @r###" + [ + ( + "my_doggo_embedder", + EmbeddingConfig { + embedder_options: HuggingFace( + EmbedderOptions { + model: "sentence-transformers/all-MiniLM-L6-v2", + revision: Some( + "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + ), + distribution: None, + }, + ), + prompt: PromptData { + template: "{{doc.doggo}}", + }, + }, + RoaringBitmap<[1, 2]>, + ), + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + // the document with the id 3 should keep its original embedding + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let mut embeddings = Vec::new(); + + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(&rtxn, 0 | (i as u16), index.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap(); + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + snapshot!(embeddings.len(), @"1"); + assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); + + // If we update marcel it should regenerate its embedding automatically + + let content = serde_json::json!( + [ + { + "id": 3, + "doggo": "marvel", + }, + { + "id": 4, + "doggo": "sorry", + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1 as u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + 
primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + // the document with the id 3 should have its original embedding updated + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + // TODO: it shouldn’t be equal to 3.0 + assert!(embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + + // the document with the id 4 should generate an embedding + // let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + // let embeddings = index.embeddings(&rtxn, docid).unwrap(); + // dbg!(&embeddings); + // let embedding = &embeddings["my_doggo_embedder"]; + + // assert!(!embedding.is_empty()); + // assert!(embedding[0]); + } } diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap new file mode 100644 index 000000000..433a190f9 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +---
+[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"userProvided":true},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"userProvided":false}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"userProvided":false}}}] diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an 
embedder.snap new file mode 100644 index 000000000..853be8b0a --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +--- +[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}] diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 223d71658..d1d82be68 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -672,7 +672,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config)| (name, Setting::Set(config.into()))) + .map(|(name, config, _)| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f23694d10..7fba2af1e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -44,7 +44,7 @@ once_cell = "1.19.0" ordered-float = "4.2.0" rand_pcg = { version = "0.3.1", features = ["serde1"] } rayon = "1.8.0" -roaring = "0.10.2" +roaring = { version = "0.10.2", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } serde_json = { version = "1.0.111", features = ["preserve_order"] } diff --git a/milli/src/index.rs b/milli/src/index.rs index ef4936ed1..569a9a692 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1572,16 +1572,18 @@ impl Index { Ok(script_language) } + /// Put the embedding configs: + /// 1. The name of the embedder + /// 2. The configuration option for this embedder + /// 3. 
The list of documents with a user provided embedding pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig)>, + configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, ) -> heed::Result<()> { - self.main.remap_types::>>().put( - wtxn, - main_key::EMBEDDING_CONFIGS, - &configs, - ) + self.main + .remap_types::>>() + .put(wtxn, main_key::EMBEDDING_CONFIGS, &configs) } pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { @@ -1591,10 +1593,10 @@ impl Index { pub fn embedding_configs( &self, rtxn: &RoTxn<'_>, - ) -> Result> { + ) -> Result> { Ok(self .main - .remap_types::>>() + .remap_types::>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)? .unwrap_or_default()) } diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 76ec90d65..d97d1403c 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -10,16 +10,16 @@ use bytemuck::cast_slice; use grenad::Writer; use itertools::EitherOrBoth; use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; -use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::Embedder; -use crate::{DocumentId, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. 
const TRUNCATE_SIZE: usize = size_of::(); @@ -35,6 +35,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, + pub user_defined: RoaringBitmap, + pub remove_from_user_defined: RoaringBitmap, } enum VectorStateDelta { @@ -80,6 +82,11 @@ struct EmbedderVectorExtractor { prompts_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, + + // The docids of the documents that contains a user defined embedding + user_defined: RoaringBitmap, + // The docids of the documents that contains an auto-generated embedding + remove_from_user_defined: RoaringBitmap, } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -134,6 +141,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined: RoaringBitmap::new(), + remove_from_user_defined: RoaringBitmap::new(), }); } @@ -141,13 +150,15 @@ pub fn extract_vector_points( let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ // this must always be serialized as (docid, external_docid); + const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::(); let (docid_bytes, external_id_bytes) = - try_split_at(key, std::mem::size_of::()).unwrap(); + try_split_array_at::(key).unwrap(); debug_assert!(from_utf8(external_id_bytes).is_ok()); + let docid = DocumentId::from_be_bytes(docid_bytes); let obkv = obkv::KvReader::new(value); key_buffer.clear(); - key_buffer.extend_from_slice(docid_bytes); + key_buffer.extend_from_slice(docid_bytes.as_slice()); // since we only need the primary key when we throw an error we create this getter to // lazily get it when needed @@ -163,10 +174,22 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined, + remove_from_user_defined, } in extractors.iter_mut() { let delta = match parsed_vectors.remove(embedder_name) { (Some(old), Some(new)) => { + match (old.is_user_provided(), new.is_user_provided()) { + (true, true) | (false, false) => (), + (true, false) => { + remove_from_user_defined.insert(docid); + } + (false, true) => { + user_defined.insert(docid); + } + } + // no autogeneration let del_vectors = old.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors(); @@ -187,6 +210,7 @@ pub fn extract_vector_points( .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { + remove_from_user_defined.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( obkv, @@ -198,6 +222,11 @@ pub fn extract_vector_points( } } (None, Some(new)) => { + if new.is_user_provided() { + user_defined.insert(docid); + } else { + remove_from_user_defined.insert(docid); + } // was possibly autogenerated, remove all vectors for that document let add_vectors = new.into_array_of_vectors(); if add_vectors.len() > usize::from(u8::MAX) { @@ -239,6 +268,7 @@ pub fn extract_vector_points( VectorStateDelta::NoChange } } else { + 
remove_from_user_defined.remove(docid); VectorStateDelta::NowRemoved } } @@ -265,18 +295,18 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined, + remove_from_user_defined, } in extractors { results.push(ExtractedVectorPoints { - // docid, _index -> KvWriterDelAdd -> Vector manual_vectors: writer_into_reader(manual_vectors_writer)?, - // docid -> () remove_vectors: writer_into_reader(remove_vectors_writer)?, - // docid -> prompt prompts: writer_into_reader(prompts_writer)?, - embedder, embedder_name, + user_defined, + remove_from_user_defined, }) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 18340a3ae..80214e7c8 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -238,6 +238,8 @@ fn send_original_documents_data( prompts, embedder_name, embedder, + user_defined, + remove_from_user_defined: auto_generated, } in extracted_vectors { let embeddings = match extract_embeddings( @@ -262,6 +264,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, + user_defined, + remove_from_user_defined: auto_generated, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2420463b4..a03e4333e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -501,6 +501,8 @@ where embeddings, manual_vectors, embedder_name, + user_defined, + remove_from_user_defined, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -509,6 +511,8 @@ where expected_dimension, manual_vectors, embedder_name, + user_defined, + remove_from_user_defined, } } otherwise => otherwise, @@ -2616,10 +2620,11 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - 
let (embedder_name, embedder) = embedding_configs.pop().unwrap(); + let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap(); + insta::assert_snapshot!(embedder_name, @"manual"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); let embedder = std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); - assert_eq!("manual", embedder_name); let res = index .search(&rtxn) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2fbe91685..2c4e17858 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -90,6 +90,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, + user_defined: RoaringBitmap, + remove_from_user_defined: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } @@ -155,7 +157,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut iter = merger.into_stream_merger_iter()?; let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); + index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -181,7 +183,7 @@ pub(crate) fn write_typed_chunk_into_index( // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is break 'vectors Some(addition); }; - vectors.retain_user_provided_vectors(&embedders); + vectors.retain_not_embedded_vectors(&embedders); let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; if vectors.is_empty() { // skip writing empty `_vectors` map @@ -619,6 +621,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut user_defined = RoaringBitmap::new(); + let mut remove_from_user_defined = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { @@ -627,6 +631,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, + user_defined: ud, + remove_from_user_defined: rud, } = typed_chunk else { unreachable!(); @@ -639,11 +645,21 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } + user_defined |= ud; + remove_from_user_defined |= rud; } // typed chunks has always at least 1 chunk. 
let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let mut embedding_configs = index.embedding_configs(&wtxn)?; + let (_name, _conf, ud) = + embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap(); + *ud -= remove_from_user_defined; + *ud |= user_defined; + + index.put_embedding_configs(wtxn, embedding_configs)?; + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 68c31fabb..64998bcc3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{EitherOrBoth, Itertools}; +use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -926,8 +927,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Setting::Set(configs) => { let mut changed = false; let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap> = - old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect(); + let old_configs: BTreeMap, RoaringBitmap)> = + old_configs + .into_iter() + .map(|(name, setting, user_defined)| { + (name, (Setting::Set(setting.into()), user_defined)) + }) + .collect(); let mut new_configs = BTreeMap::new(); for joined in old_configs @@ -936,15 +942,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, mut old), (_, new)) => { + EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => { changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); if changed { - tracing::debug!(embedder = name, "need reindex"); + tracing::debug!( + embedder = name, + documents = user_defined.len(), + "need reindex" + ); } 
else { tracing::debug!(embedder = name, "skip reindex"); } let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, new); + new_configs.insert(name, (new, user_defined)); } // unchanged config EitherOrBoth::Left((name, setting)) => { @@ -961,21 +971,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { ); let setting = validate_embedding_settings(setting, &name)?; changed = true; - new_configs.insert(name, setting); + new_configs.insert(name, (setting, RoaringBitmap::new())); } } } - let new_configs: Vec<(String, EmbeddingConfig)> = new_configs + let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs .into_iter() - .filter_map(|(name, setting)| match setting { - Setting::Set(value) => Some((name, value.into())), + .filter_map(|(name, (setting, user_defined))| match setting { + Setting::Set(settings) => Some((name, settings.into(), user_defined)), Setting::Reset => None, - Setting::NotSet => Some((name, EmbeddingSettings::default().into())), + Setting::NotSet => { + Some((name, EmbeddingSettings::default().into(), user_defined)) + } }) .collect(); self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _)) in new_configs.iter().enumerate() { + for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() { self.index.embedder_category_id.put_with_flags( self.wtxn, heed::PutFlags::APPEND, @@ -1359,10 +1371,12 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result { +fn embedders( + embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, +) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt })| { + .map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| { let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); let embedder = Arc::new( diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 2c61baa9e..62c418149 
100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -17,6 +17,13 @@ pub enum Vectors { } impl Vectors { + pub fn is_user_provided(&self) -> bool { + match self { + Vectors::ImplicitlyUserProvided(_) => true, + Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided, + } + } + pub fn into_array_of_vectors(self) -> Vec { match self { Vectors::ImplicitlyUserProvided(embeddings) @@ -89,15 +96,8 @@ impl ParsedVectors { Ok(ParsedVectors(value)) } - pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet) { - self.0.retain(|k, v| match v { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => { - *user_provided - // if the embedder is not in the config, then never touch it - || !embedders.contains(k) - } - }); + pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet) { + self.0.retain(|k, _v| !embedders.contains(k)) } } From 30d66abf8d92e0e5da0206b29457f2ab2f972b10 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 May 2024 18:07:53 +0200 Subject: [PATCH 030/110] fix the test --- index-scheduler/src/lib.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index ebeac30b3..29b7c861f 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1750,7 +1750,7 @@ mod tests { }; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; - use meilisearch_types::settings::{Checked, Unchecked}; + use meilisearch_types::settings::Unchecked; use meilisearch_types::tasks::IndexSwap; use meilisearch_types::VERSION_FILE_NAME; use tempfile::{NamedTempFile, TempDir}; @@ -5432,21 +5432,29 @@ mod tests { index_scheduler.assert_internally_consistent(); // the document with the id 3 should have its original embedding updated + let rtxn = index.read_txn().unwrap(); let docid = 
index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let doc = index.documents(&rtxn, Some(docid)).unwrap()[0]; + let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap(); + snapshot!(json_string!(doc), @r###" + { + "id": 3, + "doggo": "marvel" + } + "###); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); let embedding = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); - /// TODO: it shouldn’t be equal to 3.0 - assert!(embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); // the document with the id 4 should generate an embedding - // let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); - // let embeddings = index.embeddings(&rtxn, docid).unwrap(); - // dbg!(&embeddings); - // let embedding = &embeddings["my_doggo_embedder"]; + let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + dbg!(&embeddings); + let embedding = &embeddings["my_doggo_embedder"]; - // assert!(!embedding.is_empty()); - // assert!(embedding[0]); + assert!(!embedding.is_empty()); } } From 04f6523f3c90e16068c8b540853c24a2e19ea597 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 May 2024 17:22:58 +0200 Subject: [PATCH 031/110] expose a new parameter to retrieve the embedders at search time --- index-scheduler/src/lib.rs | 42 ++++++++++--------- meilisearch-types/src/error.rs | 2 + .../src/analytics/segment_analytics.rs | 3 ++ .../src/routes/indexes/facet_search.rs | 1 + meilisearch/src/routes/indexes/search.rs | 3 ++ meilisearch/src/routes/indexes/similar.rs | 10 ++--- meilisearch/src/search.rs | 35 +++++++++++++++- meilisearch/tests/search/hybrid.rs | 6 +-- meilisearch/tests/similar/mod.rs | 8 ++-- milli/src/vector/rest.rs | 2 + 10 files changed, 79 insertions(+), 33 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 
29b7c861f..c76a207f5 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5045,25 +5045,25 @@ mod tests { // add one doc, specifying vectors let doc = serde_json::json!( - { - "id": 0, - "doggo": "Intel", - "breed": "beagle", - "_vectors": { - &fakerest_name: { - // this will never trigger regeneration, which is good because we can't actually generate with - // this embedder - "userProvided": true, - "embeddings": beagle_embed, - }, - &simple_hf_name: { - // this will be regenerated on updates - "userProvided": false, - "embeddings": lab_embed, - }, - "noise": [0.1, 0.2, 0.3] - } - } + { + "id": 0, + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + &fakerest_name: { + // this will never trigger regeneration, which is good because we can't actually generate with + // this embedder + "userProvided": true, + "embeddings": beagle_embed, + }, + &simple_hf_name: { + // this will be regenerated on updates + "userProvided": false, + "embeddings": lab_embed, + }, + "noise": [0.1, 0.2, 0.3] + } + } ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); @@ -5163,7 +5163,9 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); - handle.advance_one_successful_batch(); + println!("HEEEEERE"); + // handle.advance_one_successful_batch(); + handle.advance_one_failed_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); { diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 150c56b9d..63543fb1b 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -240,9 +240,11 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; +InvalidSimilarRetrieveVectors , InvalidRequest , 
BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; +InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ; InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ; InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index aed29e612..3eb74c7d1 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -662,6 +662,7 @@ impl SearchAggregator { page, hits_per_page, attributes_to_retrieve: _, + retrieve_vectors: _, attributes_to_crop: _, crop_length, attributes_to_highlight: _, @@ -1079,6 +1080,7 @@ impl MultiSearchAggregator { page: _, hits_per_page: _, attributes_to_retrieve: _, + retrieve_vectors: _, attributes_to_crop: _, crop_length: _, attributes_to_highlight: _, @@ -1646,6 +1648,7 @@ impl SimilarAggregator { offset, limit, attributes_to_retrieve: _, + retrieve_vectors: _, show_ranking_score, show_ranking_score_details, filter, diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 10b371f2d..2e9cf6e1b 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -115,6 +115,7 @@ impl From for SearchQuery { page: None, hits_per_page: None, attributes_to_retrieve: None, + retrieve_vectors: false, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), attributes_to_highlight: None, diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 348d8295c..91c8c8178 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -51,6 +51,8 @@ pub struct SearchQueryGet { 
hits_per_page: Option>, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: bool, #[deserr(default, error = DeserrQueryParamError)] attributes_to_crop: Option>, #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)] @@ -153,6 +155,7 @@ impl From for SearchQuery { page: other.page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: other.retrieve_vectors, attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), crop_length: other.crop_length.0, attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 518fedab7..54ea912ec 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -4,11 +4,7 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter}; use index_scheduler::IndexScheduler; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; -use meilisearch_types::error::deserr_codes::{ - InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId, - InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold, - InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails, -}; +use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{ErrorCode as _, ResponseError}; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::keys::actions; @@ -122,6 +118,8 @@ pub struct SimilarQueryGet { limit: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: 
Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, #[deserr(default, error = DeserrQueryParamError)] @@ -156,6 +154,7 @@ impl TryFrom for SimilarQuery { offset, limit, attributes_to_retrieve, + retrieve_vectors, filter, show_ranking_score, show_ranking_score_details, @@ -180,6 +179,7 @@ impl TryFrom for SimilarQuery { filter, embedder, attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: retrieve_vectors.0, show_ranking_score: show_ranking_score.0, show_ranking_score_details: show_ranking_score_details.0, ranking_score_threshold: ranking_score_threshold.map(|x| x.0), diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 05b3c1aff..1ab42a79f 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -59,6 +59,8 @@ pub struct SearchQuery { pub hits_per_page: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_CROP_LENGTH())] @@ -141,6 +143,7 @@ impl fmt::Debug for SearchQuery { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -173,6 +176,9 @@ impl fmt::Debug for SearchQuery { if let Some(q) = q { debug.field("q", &q); } + if *retrieve_vectors { + debug.field("retrieve_vectors", &retrieve_vectors); + } if let Some(v) = vector { if v.len() < 10 { debug.field("vector", &v); @@ -370,6 +376,8 @@ pub struct SearchQueryWithIndex { pub hits_per_page: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, #[deserr(default, error = DeserrJsonError, default = 
DEFAULT_CROP_LENGTH())] @@ -413,6 +421,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -440,6 +449,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -478,6 +488,8 @@ pub struct SimilarQuery { pub embedder: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError, default)] pub show_ranking_score: bool, #[deserr(default, error = DeserrJsonError, default)] @@ -847,6 +859,7 @@ pub fn perform_search( page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -870,6 +883,7 @@ pub fn perform_search( let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight, attributes_to_crop, crop_length, @@ -953,6 +967,7 @@ pub fn perform_search( struct AttributesFormat { attributes_to_retrieve: Option>, + retrieve_vectors: bool, attributes_to_highlight: Option>, attributes_to_crop: Option>, crop_length: usize, @@ -1000,6 +1015,9 @@ fn make_hits( .intersection(&displayed_ids) .cloned() .collect(); + let is_vectors_displayed = + fields_ids_map.id("_vectors").is_some_and(|fid| displayed_ids.contains(&fid)); + let retrieve_vectors = format.retrieve_vectors && is_vectors_displayed; let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( @@ -1034,7 +1052,7 @@ fn make_hits( formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); let documents_iter = index.documents(rtxn, documents_ids)?; - for ((_id, obkv), score) in 
documents_iter.into_iter().zip(document_scores.into_iter()) { + for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; @@ -1045,6 +1063,19 @@ fn make_hits( let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); + if retrieve_vectors { + let mut vectors = serde_json::Map::new(); + for (name, mut vector) in index.embeddings(&rtxn, id)? { + if vector.len() == 1 { + let vector = vector.pop().unwrap(); + vectors.insert(name.into(), vector.into()); + } else { + vectors.insert(name.into(), vector.into()); + } + } + document.insert("_vectors".into(), vectors.into()); + } + let (matches_position, formatted) = format_fields( &displayed_document, &fields_ids_map, @@ -1125,6 +1156,7 @@ pub fn perform_similar( filter: _, embedder: _, attributes_to_retrieve, + retrieve_vectors, show_ranking_score, show_ranking_score_details, ranking_score_threshold, @@ -1171,6 +1203,7 @@ pub fn perform_similar( let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight: None, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 9c50df6e1..0c8b4534c 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -124,7 +124,7 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); @@ -133,7 +133,7 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": 
true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); @@ -142,7 +142,7 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index bde23b67f..a2378eb58 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -557,7 +557,7 @@ async fn limit_and_offset() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143, "limit": 1}), |response, code| { + .similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -567,9 +567,9 @@ async fn limit_and_offset() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } } diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index 60f54782e..e7fc509b3 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -163,6 +163,7 @@ impl Embedder { text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>>, EmbedError> { + dbg!(&text_chunks); threads .install(move || { text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() @@ -230,6 +231,7 @@ where input_value } [input] => { + dbg!(&options); let mut body = options.query.clone(); body.as_object_mut() From 9eb6f522ea62e6dd06cedd8ee553b0d7101e1d1a Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 30 May 2024 11:50:30 +0200 Subject: [PATCH 032/110] wraps the index embedding config in a struct --- Cargo.lock | 4 +- 
index-scheduler/src/lib.rs | 77 +++++++++++-------- meilisearch-types/src/settings.rs | 3 +- milli/src/index.rs | 25 +++--- milli/src/update/index_documents/mod.rs | 4 +- .../src/update/index_documents/typed_chunk.rs | 18 +++-- milli/src/update/settings.rs | 56 ++++++++------ 7 files changed, 112 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b28a00e3..b00e94072 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5310,9 +5310,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684" +checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19" dependencies = [ "actix-web", "mutually_exclusive_features", diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index c76a207f5..d007acd2c 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -53,6 +53,7 @@ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; +use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; @@ -1459,33 +1460,39 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>, + embedding_configs: Vec, ) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| { - let prompt = - 
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); - // optimistically return existing embedder - { - let embedders = self.embedders.read().unwrap(); - if let Some(embedder) = embedders.get(&embedder_options) { - return Ok((name, (embedder.clone(), prompt))); + .map( + |IndexEmbeddingConfig { + name, + config: milli::vector::EmbeddingConfig { embedder_options, prompt }, + .. + }| { + let prompt = + Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + // optimistically return existing embedder + { + let embedders = self.embedders.read().unwrap(); + if let Some(embedder) = embedders.get(&embedder_options) { + return Ok((name, (embedder.clone(), prompt))); + } } - } - // add missing embedder - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, - ); - { - let mut embedders = self.embedders.write().unwrap(); - embedders.insert(embedder_options, embedder.clone()); - } - Ok((name, (embedder, prompt))) - }) + // add missing embedder + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(meilisearch_types::milli::vector::Error::from) + .map_err(meilisearch_types::milli::Error::from)?, + ); + { + let mut embedders = self.embedders.write().unwrap(); + embedders.insert(embedder_options, embedder.clone()); + } + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } @@ -3055,10 +3062,10 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (name, embedding_config, user_provided) = configs.first().unwrap(); + let IndexEmbeddingConfig { name, config, user_defined } = configs.first().unwrap(); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - insta::assert_json_snapshot!(embedding_config.embedder_options); + 
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(config.embedder_options); } #[test] @@ -5022,15 +5029,17 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config, user_provided) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: fakerest_config, user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config, user_provided) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: simple_hf_config, user_defined } = + configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5102,11 +5111,11 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, _config, user_defined) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let (name, _config, user_defined) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); @@ -5178,11 +5187,13 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for 
consistency with the below #[allow(clippy::get_first)] - let (name, _config, user_defined) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let (name, _config, user_defined) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = + configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index d1d82be68..8a9708d29 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -8,6 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; +use milli::index::IndexEmbeddingConfig; use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; @@ -672,7 +673,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config, _)| (name, Setting::Set(config.into()))) + .map(|IndexEmbeddingConfig { name, config, .. 
}| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/milli/src/index.rs b/milli/src/index.rs index 569a9a692..a47c07e08 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -9,6 +9,7 @@ use heed::types::*; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; +use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use crate::documents::PrimaryKey; @@ -1579,24 +1580,23 @@ impl Index { pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, + configs: Vec, ) -> heed::Result<()> { - self.main - .remap_types::>>() - .put(wtxn, main_key::EMBEDDING_CONFIGS, &configs) + self.main.remap_types::>>().put( + wtxn, + main_key::EMBEDDING_CONFIGS, + &configs, + ) } pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { self.main.remap_key_type::().delete(wtxn, main_key::EMBEDDING_CONFIGS) } - pub fn embedding_configs( - &self, - rtxn: &RoTxn<'_>, - ) -> Result> { + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result> { Ok(self .main - .remap_types::>>() + .remap_types::>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)? 
.unwrap_or_default()) } @@ -1668,6 +1668,13 @@ impl Index { } } +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + pub user_defined: RoaringBitmap, +} + #[cfg(test)] pub(crate) mod tests { use std::collections::HashSet; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a03e4333e..2dc93f67a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -785,6 +785,7 @@ mod tests { use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::index::IndexEmbeddingConfig; use crate::search::TermsMatchingStrategy; use crate::update::Setting; use crate::{db_snap, Filter, Search}; @@ -2620,7 +2621,8 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_defined } = + embedding_configs.pop().unwrap(); insta::assert_snapshot!(embedder_name, @"manual"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); let embedder = diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2c4e17858..078010554 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -20,6 +20,7 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::index::IndexEmbeddingConfig; use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -156,8 +157,11 @@ pub(crate) fn write_typed_chunk_into_index( let 
mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; - let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect(); + let embedders: BTreeSet<_> = index + .embedding_configs(wtxn)? + .into_iter() + .map(|IndexEmbeddingConfig { name, .. }| name) + .collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -653,10 +657,12 @@ pub(crate) fn write_typed_chunk_into_index( let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; let mut embedding_configs = index.embedding_configs(&wtxn)?; - let (_name, _conf, ud) = - embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap(); - *ud -= remove_from_user_defined; - *ud |= user_defined; + let index_embedder_config = embedding_configs + .iter_mut() + .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name) + .unwrap(); + index_embedder_config.user_defined -= remove_from_user_defined; + index_embedder_config.user_defined |= user_defined; index.put_embedding_configs(wtxn, embedding_configs)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 64998bcc3..6b07e614e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -15,7 +15,9 @@ use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; -use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::index::{ + IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, +}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; @@ -930,8 +932,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let old_configs: BTreeMap, RoaringBitmap)> = old_configs .into_iter() - 
.map(|(name, setting, user_defined)| { - (name, (Setting::Set(setting.into()), user_defined)) + .map(|IndexEmbeddingConfig { name, config, user_defined }| { + (name, (Setting::Set(config.into()), user_defined)) }) .collect(); @@ -975,23 +977,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } } - let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs + let new_configs: Vec = new_configs .into_iter() - .filter_map(|(name, (setting, user_defined))| match setting { - Setting::Set(settings) => Some((name, settings.into(), user_defined)), - Setting::Reset => None, - Setting::NotSet => { - Some((name, EmbeddingSettings::default().into(), user_defined)) + .filter_map(|(name, (config, user_defined))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_defined }) } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_defined, + }), }) .collect(); self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() { + for (index, index_embedding_config) in new_configs.iter().enumerate() { self.index.embedder_category_id.put_with_flags( self.wtxn, heed::PutFlags::APPEND, - embedder_name, + &index_embedding_config.name, &index .try_into() .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, @@ -1371,21 +1377,25 @@ impl InnerIndexSettings { } } -fn embedders( - embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, -) -> Result { +fn embedders(embedding_configs: Vec) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + .map( + |IndexEmbeddingConfig { + name, + config: EmbeddingConfig { embedder_options, prompt }, + .. 
+ }| { + let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt))) - }) + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(crate::vector::Error::from) + .map_err(crate::Error::from)?, + ); + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } From a73ccc78a6e9db032d0195dd3347e56cbd8f2735 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 30 May 2024 12:16:06 +0200 Subject: [PATCH 033/110] forward the embedding config to the extractors --- .../index_documents/extract/extract_vector_points.rs | 2 ++ milli/src/update/index_documents/extract/mod.rs | 12 +++++++++++- milli/src/update/index_documents/mod.rs | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index d97d1403c..3eb761bce 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -14,6 +14,7 @@ use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::index::IndexEmbeddingConfig; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; @@ -96,6 +97,7 @@ struct EmbedderVectorExtractor { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, + embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, ) -> Result> { let reindex_vectors = settings_diff.reindex_vectors(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 
80214e7c8..6399b40f8 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -30,6 +30,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; +use crate::index::IndexEmbeddingConfig; use crate::update::settings::InnerIndexSettingsDiff; use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; @@ -43,6 +44,7 @@ pub(crate) fn data_from_obkv_documents( indexer: GrenadParameters, lmdb_writer_sx: Sender>, primary_key_id: FieldId, + embedders_configs: Arc>, settings_diff: Arc, max_positions_per_attributes: Option, ) -> Result<()> { @@ -55,6 +57,7 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), + embedders_configs.clone(), settings_diff.clone(), ) }) @@ -210,6 +213,7 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, + embedders_configs: Arc>, settings_diff: Arc, ) -> Result<()> { let original_documents_chunk = @@ -226,11 +230,17 @@ fn send_original_documents_data( if index_vectors { let settings_diff = settings_diff.clone(); + let embedders_configs = embedders_configs.clone(); let original_documents_chunk = original_documents_chunk.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { + match extract_vector_points( + original_documents_chunk.clone(), + indexer, + &embedders_configs, + &settings_diff, + ) { Ok(extracted_vectors) => { for ExtractedVectorPoints { manual_vectors, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2dc93f67a..907554753 100644 --- a/milli/src/update/index_documents/mod.rs +++ 
b/milli/src/update/index_documents/mod.rs @@ -286,6 +286,7 @@ where settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); + let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?); let backup_pool; let pool = match self.indexer_config.thread_pool { @@ -399,6 +400,7 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, + embedders_configs.clone(), settings_diff_cloned, max_positions_per_attributes, ) From 5d50850e12f72a07221184c7d9962f511a6dc791 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 3 Jun 2024 16:04:14 +0200 Subject: [PATCH 034/110] always push the user defined vectors in arroy --- index-scheduler/src/lib.rs | 14 +- ..._scheduler__tests__import_vectors-15.snap} | 4 - ..._scheduler__tests__import_vectors-22.snap} | 4 - ...x_scheduler__tests__import_vectors-5.snap} | 0 ...x_scheduler__tests__import_vectors-8.snap} | 0 ..._scheduler__tests__settings_update-5.snap} | 0 .../documents after setting an embedder.snap | 4 - meilisearch/tests/search/hybrid.rs | 40 +-- meilisearch/tests/search/mod.rs | 1 + meilisearch/tests/similar/mod.rs | 217 ++++++++-------- ...__attribute_fid__attribute_fid_ngrams.snap | 244 ------------------ .../1/field_distribution.snap | 7 - .../field_distribution.snap | 7 - .../extract/extract_vector_points.rs | 75 +++--- milli/src/vector/parsed_vectors.rs | 22 +- 15 files changed, 189 insertions(+), 450 deletions(-) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-9.snap => index_scheduler__tests__import_vectors-15.snap} (67%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-12.snap => index_scheduler__tests__import_vectors-22.snap} (67%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-4.snap => index_scheduler__tests__import_vectors-5.snap} (100%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-6.snap => 
index_scheduler__tests__import_vectors-8.snap} (100%) rename index-scheduler/src/snapshots/{index_scheduler__tests__settings_update-3.snap => index_scheduler__tests__settings_update-5.snap} (100%) delete mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap delete mode 100644 milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap delete mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap delete mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index d007acd2c..f69736297 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5173,8 +5173,8 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); println!("HEEEEERE"); - // handle.advance_one_successful_batch(); - handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); + // handle.advance_one_failed_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); { @@ -5351,9 +5351,9 @@ mod tests { // as user provided since we explicitely marked it as NOT user provided. 
snapshot!(format!("{conf:#?}"), @r###" [ - ( - "my_doggo_embedder", - EmbeddingConfig { + IndexEmbeddingConfig { + name: "my_doggo_embedder", + config: EmbeddingConfig { embedder_options: HuggingFace( EmbedderOptions { model: "sentence-transformers/all-MiniLM-L6-v2", @@ -5367,8 +5367,8 @@ mod tests { template: "{{doc.doggo}}", }, }, - RoaringBitmap<[1, 2]>, - ), + user_defined: RoaringBitmap<[1, 2]>, + }, ] "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap index 002a42e59..540835dfb 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "Intel", "breed": "beagle", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap index 718ea229c..bc35d84f6 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "kefir", "breed": "patou", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git 
a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap deleted file mode 100644 index 853be8b0a..000000000 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}] diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 0c8b4534c..1e415bc63 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs 
@@ -204,7 +204,7 @@ async fn distribution_shift() { let server = Server::new().await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; - let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); + let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); @@ -239,20 +239,23 @@ async fn highlighter() { let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, - "attributesToHighlight": [ - "desc" + "retrieveVectors": true, + "attributesToHighlight": [ + "desc", + "_vectors", ], - "highlightPreTag": "**BEGIN**", - "highlightPostTag": "**END**" + "highlightPreTag": "**BEGIN**", + "highlightPostTag": "**END**", })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of 
the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -262,13 +265,14 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a 
**BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.0}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -278,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -361,7 +365,7 @@ async fn single_document() { let (response, code) = index .search_post( - json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -377,7 +381,7 @@ async fn query_combination() { // search without query and vector, but with hybrid => still placeholder let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -386,7 +390,7 @@ async fn query_combination() { // same with a different semantic ratio let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true})) .await; 
snapshot!(code, @"200 OK"); @@ -395,7 +399,7 @@ async fn query_combination() { // wrong vector dimensions let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -410,7 +414,7 @@ async fn query_combination() { // full vector let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -419,7 +423,7 @@ async fn query_combination() { // full keyword, without a query let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -428,7 +432,7 @@ async fn query_combination() { // query + vector, full keyword => keyword let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -437,7 +441,7 @@ async fn query_combination() { // query + vector, no hybrid keyword => let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -453,7 +457,7 @@ 
async fn query_combination() { // full vector, without a vector => error let (response, code) = index .search_post( - json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -470,7 +474,7 @@ async fn query_combination() { // hybrid without a vector => full keyword let (response, code) = index .search_post( - json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), + json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}), ) .await; diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b65c0dc42..955b324a6 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1337,6 +1337,7 @@ async fn experimental_feature_vector_store() { .search_post(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true, + "retrieveVectors": true, })) .await; diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index a2378eb58..f2af91588 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -78,7 +78,7 @@ async fn basic() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143}), |response, code| { + .similar(json!({"id": 143, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -88,9 +88,9 @@ async fn basic() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } }, @@ -100,9 +100,9 @@ async fn basic() { "id": "299537", "_vectors": { "manual": [ - 0.6, - 0.8, - -0.2 + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 ] } }, @@ -112,9 +112,9 @@ async fn basic() { "id": "166428", "_vectors": { "manual": [ - 0.7, - 0.7, - -0.4 + 0.699999988079071, + 
0.699999988079071, + -0.4000000059604645 ] } }, @@ -124,8 +124,8 @@ async fn basic() { "id": "287947", "_vectors": { "manual": [ - 0.8, - 0.4, + 0.800000011920929, + 0.4000000059604645, -0.5 ] } @@ -136,7 +136,7 @@ async fn basic() { .await; index - .similar(json!({"id": "299537"}), |response, code| { + .similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -146,9 +146,9 @@ async fn basic() { "id": "166428", "_vectors": { "manual": [ - 0.7, - 0.7, - -0.4 + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 ] } }, @@ -158,8 +158,8 @@ async fn basic() { "id": "287947", "_vectors": { "manual": [ - 0.8, - 0.4, + 0.800000011920929, + 0.4000000059604645, -0.5 ] } @@ -170,9 +170,9 @@ async fn basic() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } }, @@ -183,8 +183,8 @@ async fn basic() { "_vectors": { "manual": [ -0.5, - 0.3, - 0.85 + 0.30000001192092896, + 0.8500000238418579 ] } } @@ -456,71 +456,77 @@ async fn filter() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - }, - { - "title": "How to Train Your Dragon: The Hidden World", - "release_year": 2019, - "id": "166428", - "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] - } - }, - { - "title": "Shazam!", - "release_year": 2019, - "id": "287947", - "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + 
[ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } + }, + { + "title": "Shazam!", + "release_year": 2019, + "id": "287947", + "_vectors": { + "manual": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } + } + ] + "###); + }, + ) .await; index - .similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "All Quiet on the Western Front", - "release_year": 1930, - "id": "143", - "_vectors": { - "manual": [ - -0.5, - 0.3, - 0.85 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "All Quiet on the Western Front", + "release_year": 1930, + "id": "143", + "_vectors": { + "manual": [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + } + } + ] + "###); + }, + ) .await; } @@ -579,24 +585,27 @@ async fn limit_and_offset() { .await; index - .similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + 
"_vectors": { + "manual": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } + } + ] + "###); + }, + ) .await; } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap deleted file mode 100644 index 930a21626..000000000 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap +++ /dev/null @@ -1,244 +0,0 @@ ---- -source: milli/src/search/new/tests/attribute_fid.rs -expression: "format!(\"{document_ids_scores:#?}\")" ---- -[ - ( - 2, - [ - Fid( - Rank { - rank: 19, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), - ( - 6, - [ - Fid( - Rank { - rank: 15, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 5, - [ - Fid( - Rank { - rank: 14, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 4, - [ - Fid( - Rank { - rank: 13, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 3, - [ - Fid( - Rank { - rank: 12, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 83, - max_rank: 91, - }, - ), - ], - ), - ( - 9, - [ - Fid( - Rank { - rank: 11, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 8, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 7, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 73, - max_rank: 91, - }, - ), - ], - ), - ( - 11, - [ - Fid( - Rank { - rank: 7, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 10, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 
81, - max_rank: 91, - }, - ), - ], - ), - ( - 13, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 12, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 78, - max_rank: 91, - }, - ), - ], - ), - ( - 14, - [ - Fid( - Rank { - rank: 5, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 0, - [ - Fid( - Rank { - rank: 1, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), -] diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 3eb761bce..1e56bec83 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -8,7 +8,6 @@ use std::sync::Arc; use bytemuck::cast_slice; use grenad::Writer; -use itertools::EitherOrBoth; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; @@ -50,7 +49,7 @@ enum VectorStateDelta { // Note: changing the 
value of the manually specified vector **should not record** this delta WasGeneratedNowManual(Vec>), - ManualDelta(Vec>, Vec>), + ManualDelta(Vec>), // Add the vector computed from the specified prompt // Remove any previous vector @@ -59,14 +58,12 @@ enum VectorStateDelta { } impl VectorStateDelta { - fn into_values(self) -> (bool, String, (Vec>, Vec>)) { + fn into_values(self) -> (bool, String, Vec>) { match self { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - VectorStateDelta::WasGeneratedNowManual(add) => { - (true, Default::default(), (Default::default(), add)) - } - VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)), + VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), + VectorStateDelta::ManualDelta(add) => (false, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } @@ -166,8 +163,14 @@ pub fn extract_vector_points( // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) - .map_err(|error| error.to_crate_error(document_id().to_string()))?; + let mut parsed_vectors = ParsedVectorsDiff::new( + docid, + embedders_configs, + obkv, + old_vectors_fid, + new_vectors_fid, + ) + .map_err(|error| error.to_crate_error(document_id().to_string()))?; for EmbedderVectorExtractor { embedder_name, @@ -182,7 +185,7 @@ pub fn extract_vector_points( { let delta = match parsed_vectors.remove(embedder_name) { (Some(old), Some(new)) => { - match (old.is_user_provided(), new.is_user_provided()) { + match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { (true, true) | (false, false) => (), (true, false) => { remove_from_user_defined.insert(docid); @@ -193,7 +196,6 @@ pub fn extract_vector_points( } // no 
autogeneration - let del_vectors = old.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors(); if add_vectors.len() > usize::from(u8::MAX) { @@ -203,15 +205,15 @@ pub fn extract_vector_points( ))); } - VectorStateDelta::ManualDelta(del_vectors, add_vectors) + VectorStateDelta::ManualDelta(add_vectors) } - (Some(_old), None) => { + (Some(old), None) => { // Do we keep this document? let document_is_kept = obkv .iter() .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept { + if document_is_kept && old.is_some() { remove_from_user_defined.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( @@ -219,6 +221,8 @@ pub fn extract_vector_points( DelAdd::Addition, new_fields_ids_map, )?) + } else if document_is_kept && old.is_none() { + VectorStateDelta::NoChange } else { VectorStateDelta::NowRemoved } @@ -315,8 +319,8 @@ pub fn extract_vector_points( Ok(results) } -/// Computes the diff between both Del and Add numbers and -/// only inserts the parts that differ in the sorter. +/// We cannot compute the diff between both Del and Add vectors. +/// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( remove_vectors_writer: &mut Writer>, prompts_writer: &mut Writer>, @@ -325,7 +329,7 @@ fn push_vectors_diff( delta: VectorStateDelta, reindex_vectors: bool, ) -> Result<()> { - let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); + let (must_remove, prompt, mut add_vectors) = delta.into_values(); if must_remove // TODO: the below condition works because we erase the vec database when a embedding setting changes. // When vector pipeline will be optimized, this should be removed. 
@@ -340,44 +344,25 @@ fn push_vectors_diff( } // We sort and dedup the vectors - del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); - del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); - let merged_vectors_iter = - itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); + // let merged_vectors_iter = + // itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); // insert vectors into the writer - for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { + for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { // Generate the key by extending the unique index to it. key_buffer.truncate(TRUNCATE_SIZE); let index = u16::try_from(i).unwrap(); key_buffer.extend_from_slice(&index.to_be_bytes()); - match eob { - EitherOrBoth::Both(_, _) => (), // no need to touch anything - EitherOrBoth::Left(vector) => { - // TODO: the below condition works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. - if !reindex_vectors { - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } - EitherOrBoth::Right(vector) => { - // We insert only the Add part of the Obkv to inform - // that we only want to remove all those vectors. 
- let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } + // We insert only the Add part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; } Ok(()) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 62c418149..672e27cc5 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -4,8 +4,9 @@ use obkv::KvReader; use serde_json::{from_slice, Value}; use super::Embedding; +use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::{FieldId, InternalError, UserError}; +use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; @@ -42,17 +43,19 @@ pub struct ExplicitVectors { } pub struct ParsedVectorsDiff { - pub old: Option>, + pub old: BTreeMap>, pub new: Option>, } impl ParsedVectorsDiff { pub fn new( + docid: DocumentId, + embedders_configs: &[IndexEmbeddingConfig], documents_diff: KvReader<'_, FieldId>, old_vectors_fid: Option, new_vectors_fid: Option, ) -> Result { - let old = match old_vectors_fid + let mut old = match old_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) @@ -68,7 +71,13 @@ impl ParsedVectorsDiff { return Err(error); } } - .flatten(); + .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); + for embedding_config in embedders_configs { + if embedding_config.user_defined.contains(docid) { + old.entry(embedding_config.name.to_string()).or_insert(None); + } + } + let new = 
new_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) @@ -78,8 +87,9 @@ impl ParsedVectorsDiff { Ok(Self { old, new }) } - pub fn remove(&mut self, embedder_name: &str) -> (Option, Option) { - let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); + /// Return (Some(None), _) in case the vector is user defined and contained in the database. + pub fn remove(&mut self, embedder_name: &str) -> (Option>, Option) { + let old = self.old.remove(embedder_name); let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); (old, new) } From cc5dca8321736805b881bcb8679f566300a8f9e8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 16:41:33 +0200 Subject: [PATCH 035/110] fix two bug and add a dump test --- index-scheduler/src/batch.rs | 26 +-- meilisearch/src/routes/indexes/search.rs | 4 +- meilisearch/src/search.rs | 12 +- meilisearch/tests/dumps/mod.rs | 206 +++++++++++++++++++++++ 4 files changed, 234 insertions(+), 14 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 181ac49a3..d59a657c9 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -909,6 +909,7 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(&rtxn)?; // 3.1. Dump the documents for ret in index.all_documents(&rtxn)? 
{ @@ -951,16 +952,21 @@ impl IndexScheduler { }; for (embedder_name, embeddings) in embeddings { - // don't change the entry if it already exists, because it was user-provided - vectors.entry(embedder_name).or_insert_with(|| { - let embeddings = ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - ), - user_provided: false, - }; - serde_json::to_value(embeddings).unwrap() - }); + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_defined.contains(id)); + + let embeddings = ExplicitVectors { + embeddings: VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + ), + user_provided, + }; + vectors.insert( + embedder_name, + serde_json::to_value(embeddings).unwrap(), + ); } } diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 91c8c8178..ae6402cf6 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -52,7 +52,7 @@ pub struct SearchQueryGet { #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, #[deserr(default, error = DeserrQueryParamError)] - retrieve_vectors: bool, + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_crop: Option>, #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)] @@ -155,7 +155,7 @@ impl From for SearchQuery { page: other.page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), - retrieve_vectors: other.retrieve_vectors, + retrieve_vectors: other.retrieve_vectors.0, attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), crop_length: other.crop_length.0, attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), diff --git a/meilisearch/src/search.rs 
b/meilisearch/src/search.rs index 1ab42a79f..d80910f09 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1051,6 +1051,7 @@ fn make_hits( formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); + let embedding_configs = index.embedding_configs(&rtxn)?; let documents_iter = index.documents(rtxn, documents_ids)?; for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields @@ -1066,12 +1067,19 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); for (name, mut vector) in index.embeddings(&rtxn, id)? { + let user_defined = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_defined.contains(id)); + let mut embedding = serde_json::Map::new(); + embedding.insert("userDefined".to_string(), user_defined.into()); if vector.len() == 1 { let vector = vector.pop().unwrap(); - vectors.insert(name.into(), vector.into()); + embedding.insert("embedding".to_string(), vector.into()); } else { - vectors.insert(name.into(), vector.into()); + embedding.insert("embedding".to_string(), vector.into()); } + vectors.insert(name.into(), embedding.into()); } document.insert("_vectors".into(), vectors.into()); } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index c8f8ca105..dfac2e806 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1938,3 +1938,209 @@ async fn import_dump_v6_containing_experimental_features() { }) .await; } + +// In this test we must generate the dump ourselves to ensure the +// `user defined` vectors are well set +#[actix_rt::test] +async fn generate_and_import_dump_containing_vectors() { + let temp = tempfile::tempdir().unwrap(); + let mut opt = default_settings(temp.path()); + let server = 
Server::new_with_options(opt.clone()).await.unwrap(); + let (code, _) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + let index = server.index("pets"); + let (response, code) = index + .update_settings(json!( + { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}", + } + } + } + )) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + let (response, code) = index + .add_documents( + json!([ + {"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }}, + {"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "userProvided": true, "embeddings": vec![1; 384] }}}, + {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "userProvided": false, "embeddings": vec![2; 384] }}}, + {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "userProvided": false }}}, + {"id": 4, "doggo": "max" }, + ]), + None, + ) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + + let (response, code) = server.create_dump().await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // ========= We made a dump, now we should clear the DB except and try to import our dump + drop(server); + tokio::fs::remove_dir_all(&opt.db_path).await.unwrap(); + let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap()); + let dump_path = opt.dump_dir.join(dump_name); + assert!(dump_path.exists(), "path: `{}`", dump_path.display()); + + opt.import_dump = Some(dump_path); + // NOTE: We shouldn't have to change the database path but 
I lost one hour + // because of a « bad path » error and that fixed it. + opt.db_path = temp.path().join("data.ms"); + + let mut server = Server::new_auth_with_options(opt, temp).await; + server.use_api_key("MASTER_KEY"); + + let (indexes, code) = server.list_indexes(None, None).await; + assert_eq!(code, 200, "{indexes}"); + + snapshot!(indexes["results"].as_array().unwrap().len(), @"1"); + snapshot!(indexes["results"][0]["uid"], @r###""pets""###); + snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###); + + let (response, code) = server.get_features().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let index = server.index("pets"); + + let (response, code) = index.settings().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "displayedAttributes": [ + "*" + ], + "searchableAttributes": [ + "*" + ], + "filterableAttributes": [], + "sortableAttributes": [], + "rankingRules": [ + "words", + "typo", + "proximity", + "attribute", + "sort", + "exactness" + ], + "stopWords": [], + "nonSeparatorTokens": [], + "separatorTokens": [], + "dictionary": [], + "synonyms": {}, + "distinctAttribute": null, + "proximityPrecision": "byWord", + "typoTolerance": { + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [] + }, + "faceting": { + "maxValuesPerFacet": 100, + "sortFacetValuesBy": { + "*": "alpha" + } + }, + "pagination": { + "maxTotalHits": 1000 + }, + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + }, + "searchCutoffMs": null + } + "###); + + index + .search(json!({"retrieveVectors": true}), 
|response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embedding" => "[vector]" }), @r###" + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "doggo_embedder": { + "userDefined": true, + "embedding": "[vector]" + } + } + }, + { + "id": 1, + "doggo": "echo", + "_vectors": { + "doggo_embedder": { + "userDefined": true, + "embedding": "[vector]" + } + } + }, + { + "id": 2, + "doggo": "intel", + "_vectors": { + "doggo_embedder": { + "userDefined": false, + "embedding": "[vector]" + } + } + }, + { + "id": 3, + "doggo": "bill", + "_vectors": { + "doggo_embedder": { + "userDefined": false, + "embedding": "[vector]" + } + } + }, + { + "id": 4, + "doggo": "max", + "_vectors": { + "doggo_embedder": { + "userDefined": false, + "embedding": "[vector]" + } + } + } + ] + "###); + }) + .await; +} From caad40964a94ee0b751d61544d8874abfb3f75d7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 17:27:31 +0200 Subject: [PATCH 036/110] implements the analytics --- meilisearch/src/analytics/segment_analytics.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 3eb74c7d1..6e91b99b0 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -622,6 +622,7 @@ pub struct SearchAggregator { // Whether a non-default embedder was specified embedder: bool, hybrid: bool, + retrieve_vectors: bool, // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap, @@ -662,7 +663,7 @@ impl SearchAggregator { page, hits_per_page, attributes_to_retrieve: _, - retrieve_vectors: _, + retrieve_vectors, attributes_to_crop: _, crop_length, attributes_to_highlight: _, @@ -729,6 +730,7 @@ impl SearchAggregator { if let Some(ref vector) = vector { ret.max_vector_size = vector.len(); } + 
ret.retrieve_vectors |= retrieve_vectors; if query.is_finite_pagination() { let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); @@ -804,6 +806,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -874,6 +877,7 @@ impl SearchAggregator { // vector self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; self.semantic_ratio |= semantic_ratio; self.hybrid |= hybrid; self.embedder |= embedder; @@ -930,6 +934,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -992,6 +997,7 @@ impl SearchAggregator { }, "vector": { "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, }, "hybrid": { "enabled": hybrid, @@ -1625,6 +1631,7 @@ pub struct SimilarAggregator { // Whether a non-default embedder was specified embedder: bool, + retrieve_vectors: bool, // pagination max_limit: usize, @@ -1648,7 +1655,7 @@ impl SimilarAggregator { offset, limit, attributes_to_retrieve: _, - retrieve_vectors: _, + retrieve_vectors, show_ranking_score, show_ranking_score_details, filter, @@ -1693,6 +1700,7 @@ impl SimilarAggregator { ret.ranking_score_threshold = ranking_score_threshold.is_some(); ret.embedder = embedder.is_some(); + ret.retrieve_vectors = *retrieve_vectors; ret } @@ -1725,6 +1733,7 @@ impl SimilarAggregator { show_ranking_score_details, embedder, ranking_score_threshold, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1754,6 +1763,7 @@ impl SimilarAggregator { } self.embedder |= embedder; + self.retrieve_vectors |= retrieve_vectors; // pagination self.max_limit = self.max_limit.max(max_limit); @@ -1788,6 +1798,7 @@ impl SimilarAggregator { show_ranking_score_details, embedder, ranking_score_threshold, + retrieve_vectors, } = self; if 
total_received == 0 { @@ -1814,6 +1825,9 @@ impl SimilarAggregator { "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, "hybrid": { "embedder": embedder, }, From 6b29676e7eaaa56bb6ddee2efe5b34636723e538 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 17:38:28 +0200 Subject: [PATCH 037/110] update snapshots --- .../1.snap | 25 +++ .../2.snap | 19 ++ meilisearch/tests/search/hybrid.rs | 30 +-- meilisearch/tests/search/mod.rs | 65 ++++--- meilisearch/tests/similar/mod.rs | 182 +++++++++++------- 5 files changed, 211 insertions(+), 110 deletions(-) create mode 100644 meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap create mode 100644 meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap diff --git a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap new file mode 100644 index 000000000..4b05d417a --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap @@ -0,0 +1,25 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 0, + "indexUid": "pets", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git 
a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap new file mode 100644 index 000000000..43971924b --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 1, + "indexUid": "pets", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 5, + "indexedDocuments": 5 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 1e415bc63..713dbe3bb 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -128,7 +128,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -137,7 +137,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - 
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index @@ -146,7 +146,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -207,7 +207,7 @@ async fn distribution_shift() { let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index .update_settings(json!({ @@ -228,7 +228,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -249,7 +249,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain 
Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -265,7 +265,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic 
Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic @@ -282,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -370,7 +370,7 @@ async fn single_document() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -385,7 +385,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio @@ -394,7 +394,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of 
the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions @@ -418,7 +418,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full keyword, without a 
query @@ -427,7 +427,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword @@ -436,7 +436,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -479,6 +479,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 955b324a6..2a2b23fd5 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1349,11 +1349,14 @@ async fn experimental_feature_vector_store() { "title": "Shazam!", "id": "287947", "_vectors": { - "manual": [ - 1.0, - 2.0, - 3.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + 1.0, + 2.0, + 3.0 + ] + } }, "_rankingScore": 1.0 }, @@ -1361,11 +1364,14 @@ async fn experimental_feature_vector_store() { "title": "Captain Marvel", "id": "299537", "_vectors": { - "manual": [ - 1.0, - 2.0, - 54.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + 1.0, + 2.0, + 54.0 + ] + } }, "_rankingScore": 0.9129111766815186 }, @@ -1373,11 +1379,14 @@ async fn experimental_feature_vector_store() { "title": "Gläss", "id": "450465", "_vectors": { - "manual": [ - -100.0, - 340.0, - 90.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + -100.0, + 340.0, + 90.0 + ] + } }, "_rankingScore": 0.8106412887573242 }, 
@@ -1385,11 +1394,14 @@ async fn experimental_feature_vector_store() { "title": "How to Train Your Dragon: The Hidden World", "id": "166428", "_vectors": { - "manual": [ - -100.0, - 231.0, - 32.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + -100.0, + 231.0, + 32.0 + ] + } }, "_rankingScore": 0.7412010431289673 }, @@ -1397,11 +1409,14 @@ async fn experimental_feature_vector_store() { "title": "Escape Room", "id": "522681", "_vectors": { - "manual": [ - 10.0, - -23.0, - 32.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + 10.0, + -23.0, + 32.0 + ] + } }, "_rankingScore": 0.6972063183784485 } diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index f2af91588..7c9f4fff0 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -87,11 +87,14 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } } }, { @@ -99,11 +102,14 @@ async fn basic() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } } }, { @@ -111,11 +117,14 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } } }, { @@ -123,11 +132,14 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 
0.4000000059604645, + -0.5 + ] + } } } ] @@ -145,11 +157,14 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } } }, { @@ -157,11 +172,14 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } } }, { @@ -169,11 +187,14 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } } }, { @@ -181,11 +202,14 @@ async fn basic() { "release_year": 1930, "id": "143", "_vectors": { - "manual": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "manual": { + "userDefined": true, + "embedding": [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + } } } ] @@ -467,11 +491,14 @@ async fn filter() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } } }, { @@ -479,11 +506,14 @@ async fn filter() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } } }, { @@ -491,11 +521,14 @@ async fn filter() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 
- ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } } } ] @@ -516,11 +549,14 @@ async fn filter() { "release_year": 1930, "id": "143", "_vectors": { - "manual": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "manual": { + "userDefined": true, + "embedding": [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + } } } ] @@ -572,11 +608,14 @@ async fn limit_and_offset() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } } } ] @@ -596,11 +635,14 @@ async fn limit_and_offset() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } } } ] From b867829ef1d067d41217512619fa123a6269a3ab Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 18:18:24 +0200 Subject: [PATCH 038/110] remove useless dbg --- index-scheduler/src/lib.rs | 1 - milli/src/vector/rest.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index f69736297..57eccbe66 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5465,7 +5465,6 @@ mod tests { // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - dbg!(&embeddings); let embedding = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index e7fc509b3..fd771a228 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -163,7 +163,6 @@ impl 
Embedder { text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>>, EmbedError> { - dbg!(&text_chunks); threads .install(move || { text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() From d92c173fdc1abcce46ec2489bac90448f4bc6673 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 11:05:19 +0200 Subject: [PATCH 039/110] update the new similar tests --- meilisearch/tests/similar/mod.rs | 140 +++++++++++++++++++------------ 1 file changed, 85 insertions(+), 55 deletions(-) diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index 7c9f4fff0..2b70b3df5 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -252,7 +252,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4"); @@ -263,11 +263,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 }, @@ -276,11 +279,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } }, "_rankingScore": 0.39060014486312866 }, @@ -289,11 +295,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } 
}, "_rankingScore": 0.2819308042526245 }, @@ -302,11 +311,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } }, "_rankingScore": 0.1662663221359253 } @@ -318,7 +330,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3"); @@ -329,11 +341,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 }, @@ -342,11 +357,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } }, "_rankingScore": 0.39060014486312866 }, @@ -355,11 +373,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } }, "_rankingScore": 0.2819308042526245 } @@ -371,7 +392,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); 
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2"); @@ -382,11 +403,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 }, @@ -395,11 +419,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } }, "_rankingScore": 0.39060014486312866 } @@ -411,7 +438,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1"); @@ -422,11 +449,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 } @@ -438,7 +468,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @"[]"); From 376b3a19a755da0158be9f632b4d3c913a8297b2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 11:21:06 +0200 Subject: [PATCH 040/110] makes clippy and fmt happy --- meilisearch/src/search.rs | 6 +++--- 
milli/src/fieldids_weights_map.rs | 3 ++- milli/src/update/index_documents/typed_chunk.rs | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index d80910f09..9b72ed596 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1051,7 +1051,7 @@ fn make_hits( formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); - let embedding_configs = index.embedding_configs(&rtxn)?; + let embedding_configs = index.embedding_configs(rtxn)?; let documents_iter = index.documents(rtxn, documents_ids)?; for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields @@ -1066,7 +1066,7 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); - for (name, mut vector) in index.embeddings(&rtxn, id)? { + for (name, mut vector) in index.embeddings(rtxn, id)? 
{ let user_defined = embedding_configs .iter() .find(|conf| conf.name == name) @@ -1079,7 +1079,7 @@ fn make_hits( } else { embedding.insert("embedding".to_string(), vector.into()); } - vectors.insert(name.into(), embedding.into()); + vectors.insert(name, embedding.into()); } document.insert("_vectors".into(), vectors.into()); } diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 2bf828711..13f2f8afc 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,7 +4,8 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME, FieldId, FieldsIdsMap, Weight}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; +use crate::{FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] pub struct FieldidsWeightsMap { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 078010554..ab9ef0525 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -656,7 +656,7 @@ pub(crate) fn write_typed_chunk_into_index( // typed chunks has always at least 1 chunk. let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; - let mut embedding_configs = index.embedding_configs(&wtxn)?; + let mut embedding_configs = index.embedding_configs(wtxn)?; let index_embedder_config = embedding_configs .iter_mut() .find(|IndexEmbeddingConfig { name, .. 
}| name == &embedder_name) From 400cf3eb92be992e8056d3acddb8ffa086396051 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 11:49:03 +0200 Subject: [PATCH 041/110] add api error test on the new retrieveVectors parameter --- meilisearch/tests/search/errors.rs | 68 +++++++++++++++++++++++++++++ meilisearch/tests/similar/errors.rs | 51 ++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 53d516c44..75977b190 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -167,6 +167,74 @@ async fn search_bad_hits_per_page() { "###); } +#[actix_rt::test] +async fn search_bad_attributes_to_retrieve() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"attributesToRetrieve": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`", + "code": "invalid_search_attributes_to_retrieve", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve" + } + "###); + // Can't make the `attributes_to_retrieve` fail with a get search since it'll accept anything as an array of strings. 
+} + +#[actix_rt::test] +async fn search_bad_retrieve_vectors() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("retrieveVectors=doggo").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); +} + #[actix_rt::test] async fn 
search_bad_attributes_to_crop() { let server = Server::new().await; diff --git a/meilisearch/tests/similar/errors.rs b/meilisearch/tests/similar/errors.rs index 7765b9a85..546554882 100644 --- a/meilisearch/tests/similar/errors.rs +++ b/meilisearch/tests/similar/errors.rs @@ -756,3 +756,54 @@ async fn filter_reserved_geo_point_string() { }) .await; } + +#[actix_rt::test] +async fn similar_bad_retrieve_vectors() { + let server = Server::new().await; + server.set_features(json!({"vectorStore": true})).await; + let index = server.index("test"); + + let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("retrieveVectors=doggo").await; + 
snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); +} From 49fa41ce6590862be2b0739c343c0b78861c5d97 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 12:09:14 +0200 Subject: [PATCH 042/110] apply first round of review comments --- index-scheduler/src/lib.rs | 2 -- meilisearch/src/search.rs | 10 ++++------ meilisearch/tests/dumps/mod.rs | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 57eccbe66..f98e419a1 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5172,9 +5172,7 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); - println!("HEEEEERE"); handle.advance_one_successful_batch(); - // handle.advance_one_failed_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); { diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 9b72ed596..c749dff86 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1073,12 +1073,10 @@ fn make_hits( .is_some_and(|conf| conf.user_defined.contains(id)); let mut embedding = serde_json::Map::new(); embedding.insert("userDefined".to_string(), user_defined.into()); - if vector.len() == 1 { - let vector = vector.pop().unwrap(); - embedding.insert("embedding".to_string(), vector.into()); - } else { - embedding.insert("embedding".to_string(), vector.into()); - } + match vector.as_mut_slice() { + [one] => embedding.insert("embedding".to_string(), std::mem::take(one).into()), + _ => embedding.insert("embedding".to_string(), vector.into()), + }; vectors.insert(name, 
embedding.into()); } document.insert("_vectors".into(), vectors.into()); diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index dfac2e806..b657fc1ee 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1993,7 +1993,7 @@ async fn generate_and_import_dump_containing_vectors() { let response = index.wait_task(response.uid()).await; snapshot!(response["status"], @r###""succeeded""###); - // ========= We made a dump, now we should clear the DB except and try to import our dump + // ========= We made a dump, now we should clear the DB and try to import our dump drop(server); tokio::fs::remove_dir_all(&opt.db_path).await.unwrap(); let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap()); From b7349910d9285ec485bb59f9712f7d211604b10f Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 15:19:22 +0200 Subject: [PATCH 043/110] implements mor review comments --- index-scheduler/src/lib.rs | 2 +- .../extract/extract_vector_points.rs | 39 +++++++++---------- .../src/update/index_documents/extract/mod.rs | 8 ++-- milli/src/update/index_documents/mod.rs | 8 ++-- .../src/update/index_documents/typed_chunk.rs | 8 ++-- milli/src/vector/rest.rs | 1 - 6 files changed, 31 insertions(+), 35 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index f98e419a1..1f5a1fdcd 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5012,7 +5012,7 @@ mod tests { insta::assert_json_snapshot!(task.details); } - handle.advance_n_successful_batches(1); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); { diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 1e56bec83..88c42864e 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ 
b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -35,8 +35,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, - pub user_defined: RoaringBitmap, - pub remove_from_user_defined: RoaringBitmap, + pub user_provided: RoaringBitmap, + pub remove_from_user_provided: RoaringBitmap, } enum VectorStateDelta { @@ -82,9 +82,9 @@ struct EmbedderVectorExtractor { remove_vectors_writer: Writer>, // The docids of the documents that contains a user defined embedding - user_defined: RoaringBitmap, + user_provided: RoaringBitmap, // The docids of the documents that contains an auto-generated embedding - remove_from_user_defined: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -140,8 +140,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined: RoaringBitmap::new(), - remove_from_user_defined: RoaringBitmap::new(), + user_provided: RoaringBitmap::new(), + remove_from_user_provided: RoaringBitmap::new(), }); } @@ -179,8 +179,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } in extractors.iter_mut() { let delta = match parsed_vectors.remove(embedder_name) { @@ -188,10 +188,10 @@ pub fn extract_vector_points( match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { (true, true) | (false, false) => (), (true, false) => { - remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); } (false, true) => { - user_defined.insert(docid); + user_provided.insert(docid); } } @@ -214,7 +214,7 @@ pub fn extract_vector_points( .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept && old.is_some() { - 
remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( obkv, @@ -229,9 +229,9 @@ pub fn extract_vector_points( } (None, Some(new)) => { if new.is_user_provided() { - user_defined.insert(docid); + user_provided.insert(docid); } else { - remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); } // was possibly autogenerated, remove all vectors for that document let add_vectors = new.into_array_of_vectors(); @@ -274,7 +274,7 @@ pub fn extract_vector_points( VectorStateDelta::NoChange } } else { - remove_from_user_defined.remove(docid); + remove_from_user_provided.remove(docid); VectorStateDelta::NowRemoved } } @@ -301,8 +301,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } in extractors { results.push(ExtractedVectorPoints { @@ -311,8 +311,8 @@ pub fn extract_vector_points( prompts: writer_into_reader(prompts_writer)?, embedder, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, }) } @@ -347,9 +347,6 @@ fn push_vectors_diff( add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); - // let merged_vectors_iter = - // itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); - // insert vectors into the writer for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { // Generate the key by extending the unique index to it. 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 6399b40f8..2babe330f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -248,8 +248,8 @@ fn send_original_documents_data( prompts, embedder_name, embedder, - user_defined, - remove_from_user_defined: auto_generated, + user_provided, + remove_from_user_provided, } in extracted_vectors { let embeddings = match extract_embeddings( @@ -274,8 +274,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, - user_defined, - remove_from_user_defined: auto_generated, + user_provided, + remove_from_user_provided, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 907554753..07c10bb45 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -503,8 +503,8 @@ where embeddings, manual_vectors, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -513,8 +513,8 @@ where expected_dimension, manual_vectors, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } } otherwise => otherwise, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index ab9ef0525..05c849809 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -91,8 +91,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, - user_defined: RoaringBitmap, - remove_from_user_defined: RoaringBitmap, + user_provided: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, 
Language), (RoaringBitmap, RoaringBitmap)>), } @@ -635,8 +635,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, - user_defined: ud, - remove_from_user_defined: rud, + user_provided: ud, + remove_from_user_provided: rud, } = typed_chunk else { unreachable!(); diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index fd771a228..60f54782e 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -230,7 +230,6 @@ where input_value } [input] => { - dbg!(&options); let mut body = options.query.clone(); body.as_object_mut() From d85ab23b82276a72ff812fab0a32ba70ccf958ec Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 15:38:49 +0200 Subject: [PATCH 044/110] rename all occurences of user_defined to user_provided for consistency --- index-scheduler/src/batch.rs | 2 +- index-scheduler/src/lib.rs | 25 +++++++++--------- meilisearch/src/search.rs | 6 ++--- milli/src/index.rs | 2 +- milli/src/update/index_documents/mod.rs | 4 +-- .../src/update/index_documents/typed_chunk.rs | 12 ++++----- milli/src/update/settings.rs | 26 +++++++++++-------- milli/src/vector/parsed_vectors.rs | 2 +- 8 files changed, 42 insertions(+), 37 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index d59a657c9..30ff54a62 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -955,7 +955,7 @@ impl IndexScheduler { let user_provided = embedding_configs .iter() .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_defined.contains(id)); + .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { embeddings: VectorOrArrayOfVectors::from_array_of_vectors( diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 1f5a1fdcd..8d6237408 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -3062,9 +3062,9 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = 
index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name, config, user_defined } = configs.first().unwrap(); + let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(config.embedder_options); } @@ -5029,17 +5029,17 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: fakerest_config, user_defined } = + let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let IndexEmbeddingConfig { name, config: simple_hf_config, user_defined } = + let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5111,13 +5111,14 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - 
let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); let embeddings = index.embeddings(&rtxn, 0).unwrap(); @@ -5185,15 +5186,15 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_defined } = + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let IndexEmbeddingConfig { name, config: _, user_defined } = + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); let embeddings = index.embeddings(&rtxn, 0).unwrap(); diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index c749dff86..a0c05b09e 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1067,12 +1067,12 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); for (name, mut vector) in index.embeddings(rtxn, id)? 
{ - let user_defined = embedding_configs + let user_provided = embedding_configs .iter() .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_defined.contains(id)); + .is_some_and(|conf| conf.user_provided.contains(id)); let mut embedding = serde_json::Map::new(); - embedding.insert("userDefined".to_string(), user_defined.into()); + embedding.insert("userProvided".to_string(), user_provided.into()); match vector.as_mut_slice() { [one] => embedding.insert("embedding".to_string(), std::mem::take(one).into()), _ => embedding.insert("embedding".to_string(), vector.into()), diff --git a/milli/src/index.rs b/milli/src/index.rs index a47c07e08..d325d6fa4 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1672,7 +1672,7 @@ impl Index { pub struct IndexEmbeddingConfig { pub name: String, pub config: EmbeddingConfig, - pub user_defined: RoaringBitmap, + pub user_provided: RoaringBitmap, } #[cfg(test)] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 07c10bb45..a533f1984 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2623,10 +2623,10 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_defined } = + let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = embedding_configs.pop().unwrap(); insta::assert_snapshot!(embedder_name, @"manual"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); let embedder = std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); let res = index diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 05c849809..0cb5e58af 100644 --- 
a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -625,8 +625,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); - let mut user_defined = RoaringBitmap::new(); - let mut remove_from_user_defined = RoaringBitmap::new(); + let mut user_provided = RoaringBitmap::new(); + let mut remove_from_user_provided = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { @@ -649,8 +649,8 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } - user_defined |= ud; - remove_from_user_defined |= rud; + user_provided |= ud; + remove_from_user_provided |= rud; } // typed chunks has always at least 1 chunk. @@ -661,8 +661,8 @@ pub(crate) fn write_typed_chunk_into_index( .iter_mut() .find(|IndexEmbeddingConfig { name, .. 
}| name == &embedder_name) .unwrap(); - index_embedder_config.user_defined -= remove_from_user_defined; - index_embedder_config.user_defined |= user_defined; + index_embedder_config.user_provided -= remove_from_user_provided; + index_embedder_config.user_provided |= user_provided; index.put_embedding_configs(wtxn, embedding_configs)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6b07e614e..08b12d178 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -932,9 +932,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let old_configs: BTreeMap, RoaringBitmap)> = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_defined }| { - (name, (Setting::Set(config.into()), user_defined)) - }) + .map( + |IndexEmbeddingConfig { name, config, user_provided: user_defined }| { + (name, (Setting::Set(config.into()), user_defined)) + }, + ) .collect(); let mut new_configs = BTreeMap::new(); @@ -944,19 +946,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => { + EitherOrBoth::Both((name, (mut old, user_provided)), (_, new)) => { changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); if changed { tracing::debug!( embedder = name, - documents = user_defined.len(), + user_provided = user_provided.len(), "need reindex" ); } else { tracing::debug!(embedder = name, "skip reindex"); } let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, (new, user_defined)); + new_configs.insert(name, (new, user_provided)); } // unchanged config EitherOrBoth::Left((name, setting)) => { @@ -979,15 +981,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } let new_configs: Vec = new_configs .into_iter() - .filter_map(|(name, (config, user_defined))| match config { - Setting::Set(config) => { - Some(IndexEmbeddingConfig { name, config: config.into(), user_defined }) - } + .filter_map(|(name, (config, 
user_provided))| match config { + Setting::Set(config) => Some(IndexEmbeddingConfig { + name, + config: config.into(), + user_provided, + }), Setting::Reset => None, Setting::NotSet => Some(IndexEmbeddingConfig { name, config: EmbeddingSettings::default().into(), - user_defined, + user_provided, }), }) .collect(); diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 672e27cc5..4e9e60520 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -73,7 +73,7 @@ impl ParsedVectorsDiff { } .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); for embedding_config in embedders_configs { - if embedding_config.user_defined.contains(docid) { + if embedding_config.user_provided.contains(docid) { old.entry(embedding_config.name.to_string()).or_insert(None); } } From 31a793d226154dcecff10f8e761582b775665dd5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 16:10:56 +0200 Subject: [PATCH 045/110] fix the regeneration of the embeddings in the search --- index-scheduler/src/lib.rs | 2 +- meilisearch/src/search.rs | 12 +- meilisearch/tests/dumps/mod.rs | 25 +-- meilisearch/tests/search/hybrid.rs | 30 +-- meilisearch/tests/search/mod.rs | 70 +++--- meilisearch/tests/similar/mod.rs | 336 ++++++++++++++++------------- milli/src/vector/parsed_vectors.rs | 16 ++ 7 files changed, 281 insertions(+), 210 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 8d6237408..de263c50d 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5366,7 +5366,7 @@ mod tests { template: "{{doc.doggo}}", }, }, - user_defined: RoaringBitmap<[1, 2]>, + user_provided: RoaringBitmap<[1, 2]>, }, ] "###); diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index a0c05b09e..ce712f17f 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -15,6 +15,7 @@ use meilisearch_types::error::{Code, 
ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; @@ -1066,18 +1067,13 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); - for (name, mut vector) in index.embeddings(rtxn, id)? { + for (name, vector) in index.embeddings(rtxn, id)? { let user_provided = embedding_configs .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(id)); - let mut embedding = serde_json::Map::new(); - embedding.insert("userProvided".to_string(), user_provided.into()); - match vector.as_mut_slice() { - [one] => embedding.insert("embedding".to_string(), std::mem::take(one).into()), - _ => embedding.insert("embedding".to_string(), vector.into()), - }; - vectors.insert(name, embedding.into()); + let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + vectors.insert(name, serde_json::to_value(embeddings)?); } document.insert("_vectors".into(), vectors.into()); } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index b657fc1ee..6f93d94a7 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1940,8 +1940,9 @@ async fn import_dump_v6_containing_experimental_features() { } // In this test we must generate the dump ourselves to ensure the -// `user defined` vectors are well set +// `user provided` vectors are well set #[actix_rt::test] +#[cfg_attr(target_os = "windows", ignore)] async fn generate_and_import_dump_containing_vectors() { let temp = tempfile::tempdir().unwrap(); let mut opt = default_settings(temp.path()); @@ -2087,15 +2088,15 @@ async fn 
generate_and_import_dump_containing_vectors() { index .search(json!({"retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embedding" => "[vector]" }), @r###" + snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###" [ { "id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": { - "userDefined": true, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": true } } }, @@ -2104,8 +2105,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "echo", "_vectors": { "doggo_embedder": { - "userDefined": true, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": true } } }, @@ -2114,8 +2115,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "intel", "_vectors": { "doggo_embedder": { - "userDefined": false, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": false } } }, @@ -2124,8 +2125,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "bill", "_vectors": { "doggo_embedder": { - "userDefined": false, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": false } } }, @@ -2134,8 +2135,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "max", "_vectors": { "doggo_embedder": { - "userDefined": false, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": false } } } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 713dbe3bb..b8a4110ad 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -128,7 +128,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}}},{"title":"Captain 
Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -137,7 +137,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index @@ -146,7 +146,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -207,7 +207,7 @@ async fn distribution_shift() { let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's 
not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index .update_settings(json!({ @@ -228,7 +228,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -249,7 +249,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let 
(response, code) = index @@ -265,7 +265,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic @@ -282,7 +282,7 @@ async fn highlighter() { })) .await; 
snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -370,7 +370,7 @@ async fn single_document() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0}"###); + 
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -385,7 +385,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio @@ -394,7 +394,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions @@ -418,7 +418,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full 
keyword, without a query @@ -427,7 +427,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword @@ -436,7 +436,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel 
Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -479,6 +479,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 2a2b23fd5..9e19fa4e8 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1350,12 +1350,14 @@ async fn experimental_feature_vector_store() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 1.0, - 2.0, - 3.0 - ] + "embeddings": [ + [ + 1.0, + 2.0, + 3.0 + ] + ], + "userProvided": true } }, "_rankingScore": 1.0 @@ -1365,12 +1367,14 @@ async fn experimental_feature_vector_store() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 1.0, - 2.0, - 54.0 - ] + "embeddings": [ + [ + 1.0, + 2.0, + 54.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.9129111766815186 @@ -1380,12 +1384,14 @@ async fn 
experimental_feature_vector_store() { "id": "450465", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -100.0, - 340.0, - 90.0 - ] + "embeddings": [ + [ + -100.0, + 340.0, + 90.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.8106412887573242 @@ -1395,12 +1401,14 @@ async fn experimental_feature_vector_store() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -100.0, - 231.0, - 32.0 - ] + "embeddings": [ + [ + -100.0, + 231.0, + 32.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.7412010431289673 @@ -1410,12 +1418,14 @@ async fn experimental_feature_vector_store() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 10.0, - -23.0, - 32.0 - ] + "embeddings": [ + [ + 10.0, + -23.0, + 32.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.6972063183784485 diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index 2b70b3df5..0a568553c 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -88,12 +88,14 @@ async fn basic() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } } }, @@ -103,12 +105,14 @@ async fn basic() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } } }, @@ -118,12 +122,14 @@ async fn basic() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + 
"userProvided": true } } }, @@ -133,12 +139,14 @@ async fn basic() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } } } @@ -158,12 +166,14 @@ async fn basic() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "userProvided": true } } }, @@ -173,12 +183,14 @@ async fn basic() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } } }, @@ -188,12 +200,14 @@ async fn basic() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } } }, @@ -203,12 +217,14 @@ async fn basic() { "id": "143", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "embeddings": [ + [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + ], + "userProvided": true } } } @@ -264,12 +280,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -280,12 +298,14 @@ async fn ranking_score_threshold() { "id": "299537", "_vectors": { "manual": { - 
"userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } }, "_rankingScore": 0.39060014486312866 @@ -296,12 +316,14 @@ async fn ranking_score_threshold() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "userProvided": true } }, "_rankingScore": 0.2819308042526245 @@ -312,12 +334,14 @@ async fn ranking_score_threshold() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } }, "_rankingScore": 0.1662663221359253 @@ -342,12 +366,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -358,12 +384,14 @@ async fn ranking_score_threshold() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } }, "_rankingScore": 0.39060014486312866 @@ -374,12 +402,14 @@ async fn ranking_score_threshold() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + 
-0.4000000059604645 + ] + ], + "userProvided": true } }, "_rankingScore": 0.2819308042526245 @@ -404,12 +434,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -420,12 +452,14 @@ async fn ranking_score_threshold() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } }, "_rankingScore": 0.39060014486312866 @@ -450,12 +484,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -522,12 +558,14 @@ async fn filter() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } } }, @@ -537,12 +575,14 @@ async fn filter() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "userProvided": true } } }, @@ -552,12 +592,14 @@ async fn filter() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 
0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } } } @@ -580,12 +622,14 @@ async fn filter() { "id": "143", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "embeddings": [ + [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + ], + "userProvided": true } } } @@ -639,12 +683,14 @@ async fn limit_and_offset() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } } } @@ -666,12 +712,14 @@ async fn limit_and_offset() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } } } diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 4e9e60520..501bd2ad2 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -160,6 +160,22 @@ impl VectorOrArrayOfVectors { pub fn from_array_of_vectors(array_of_vec: Vec) -> Self { Self { inner: Some(either::Either::Left(array_of_vec)) } } + + pub fn from_vector(vec: Embedding) -> Self { + Self { inner: Some(either::Either::Right(vec)) } + } +} + +impl From for VectorOrArrayOfVectors { + fn from(vec: Embedding) -> Self { + Self::from_vector(vec) + } +} + +impl From> for VectorOrArrayOfVectors { + fn from(vec: Vec) -> Self { + Self::from_array_of_vectors(vec) + } } #[cfg(test)] From ea61e5cbec610b8025dd3448162a5bb769e603d0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 16:26:03 +0200 Subject: [PATCH 046/110] makes clippy happy x2 --- index-scheduler/src/lib.rs | 
28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index de263c50d..50fc619d8 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5271,7 +5271,7 @@ mod tests { ] ); - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0 as u128).unwrap(); + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); let documents_count = read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) .unwrap(); @@ -5307,16 +5307,18 @@ mod tests { .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); - let mut setting = meilisearch_types::settings::Settings::::default(); - setting.embedders = Setting::Set(maplit::btreemap! { - S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), - model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), - revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), - document_template: Setting::Set(S("{{doc.doggo}}")), - .. EmbeddingSettings::default() - }) - }); + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }) + }), + ..Default::default() + }; index_scheduler .register( KindWithContent::SettingsUpdate { @@ -5380,7 +5382,7 @@ mod tests { let mut embeddings = Vec::new(); 'vectors: for i in 0..=u8::MAX { - let reader = arroy::Reader::open(&rtxn, 0 | (i as u16), index.vector_arroy) + let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy) .map(Some) .or_else(|e| match e { arroy::Error::MissingMetadata => Ok(None), @@ -5418,7 +5420,7 @@ mod tests { ] ); - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1 as u128).unwrap(); + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap(); let documents_count = read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) .unwrap(); From 6607875f49b3047a3fe6d8771a700d21f36a0b9e Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 23:40:29 +0200 Subject: [PATCH 047/110] add the retrieveVectors parameter to the get and fetch documents route --- meilisearch-types/src/error.rs | 1 + meilisearch/src/routes/indexes/documents.rs | 80 ++++-- meilisearch/tests/common/index.rs | 37 +-- meilisearch/tests/common/mod.rs | 2 +- meilisearch/tests/documents/errors.rs | 24 ++ meilisearch/tests/documents/get_documents.rs | 268 ++++++++++++++++--- 6 files changed, 325 insertions(+), 87 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 63543fb1b..ae2a753db 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -222,6 +222,7 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ; InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ; 
InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFields , InvalidRequest , BAD_REQUEST ; +InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ; MissingDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 43fab1dae..97ded8069 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -16,6 +16,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::IndexDocumentsMethod; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::DocumentId; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; @@ -94,6 +95,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub struct GetDocument { #[deserr(default, error = DeserrQueryParamError)] fields: OptionStarOrList, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, } pub async fn get_document( @@ -109,11 +112,12 @@ pub async fn get_document( analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req); - let GetDocument { fields } = params.into_inner(); + let GetDocument { fields, retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); let index = index_scheduler.index(&index_uid)?; - let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?; + let document = + retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors.0)?; debug!(returns = ?document, "Get document"); Ok(HttpResponse::Ok().json(document)) } @@ -153,6 +157,8 @@ pub struct BrowseQueryGet { limit: Param, #[deserr(default, error = 
DeserrQueryParamError)] fields: OptionStarOrList, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, } @@ -166,6 +172,8 @@ pub struct BrowseQuery { limit: usize, #[deserr(default, error = DeserrJsonError)] fields: Option>, + #[deserr(default, error = DeserrJsonError)] + retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] filter: Option, } @@ -201,7 +209,7 @@ pub async fn get_documents( ) -> Result { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner(); + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); let filter = match filter { Some(f) => match serde_json::from_str(&f) { @@ -215,6 +223,7 @@ pub async fn get_documents( offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), + retrieve_vectors: retrieve_vectors.0, filter, }; @@ -236,10 +245,11 @@ fn documents_by_query( query: BrowseQuery, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, filter } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; let index = index_scheduler.index(&index_uid)?; - let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?; + let (total, documents) = + retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?; let ret = PaginationView::new(offset, limit, total as usize, documents); @@ -579,13 +589,33 @@ fn some_documents<'a, 't: 'a>( index: &'a Index, rtxn: &'t RoTxn, doc_ids: impl IntoIterator + 'a, + retrieve_vectors: bool, ) -> Result> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(rtxn)?; Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { 
- ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> { - Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?) + ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; + + if retrieve_vectors { + let mut vectors = serde_json::Map::new(); + for (name, vector) in index.embeddings(rtxn, key)? { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(key)); + let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, + ); + } + document.insert("_vectors".into(), vectors.into()); + } + + Ok(document) }) })) } @@ -596,6 +626,7 @@ fn retrieve_documents>( limit: usize, filter: Option, attributes_to_retrieve: Option>, + retrieve_vectors: bool, ) -> Result<(u64, Vec), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -620,53 +651,58 @@ fn retrieve_documents>( let (it, number_of_documents) = { let number_of_documents = candidates.len(); ( - some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?, + some_documents( + index, + &rtxn, + candidates.into_iter().skip(offset).take(limit), + retrieve_vectors, + )?, number_of_documents, ) }; - let documents: Result, ResponseError> = it + let documents: Vec<_> = it .map(|document| { Ok(match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document?, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve + .iter() + .map(|s| s.as_ref()) + .chain(retrieve_vectors.then_some("_vectors")), ), None => document?, }) }) - .collect(); + .collect::>()?; - Ok((number_of_documents, documents?)) + Ok((number_of_documents, documents)) } fn retrieve_document>( index: &Index, 
doc_id: &str, attributes_to_retrieve: Option>, + retrieve_vectors: bool, ) -> Result { let txn = index.read_txn()?; - let fields_ids_map = index.fields_ids_map(&txn)?; - let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let internal_id = index .external_documents_ids() .get(&txn, doc_id)? .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; - let document = index - .documents(&txn, std::iter::once(internal_id))? - .into_iter() + let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)? .next() - .map(|(_, d)| d) - .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; + .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??; - let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; let document = match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve + .iter() + .map(|s| s.as_ref()) + .chain(retrieve_vectors.then_some("_vectors")), ), None => document, }; diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index 3ac33b4e9..f81fe8c8a 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -182,14 +182,10 @@ impl Index<'_> { self.service.get(url).await } - pub async fn get_document( - &self, - id: u64, - options: Option, - ) -> (Value, StatusCode) { + pub async fn get_document(&self, id: u64, options: Option) -> (Value, StatusCode) { let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id); - if let Some(fields) = options.and_then(|o| o.fields) { - let _ = write!(url, "?fields={}", fields.join(",")); + if let Some(options) = options { + write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap(); } self.service.get(url).await } @@ -205,18 +201,11 @@ impl Index<'_> { } pub async fn 
get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) { - let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref())); - if let Some(limit) = options.limit { - let _ = write!(url, "limit={}&", limit); - } - - if let Some(offset) = options.offset { - let _ = write!(url, "offset={}&", offset); - } - - if let Some(attributes_to_retrieve) = options.attributes_to_retrieve { - let _ = write!(url, "fields={}&", attributes_to_retrieve.join(",")); - } + let url = format!( + "/indexes/{}/documents?{}", + urlencode(self.uid.as_ref()), + yaup::to_string(&options).unwrap() + ); self.service.get(url).await } @@ -435,13 +424,11 @@ impl Index<'_> { } } -pub struct GetDocumentOptions { - pub fields: Option>, -} - -#[derive(Debug, Default)] +#[derive(Debug, Default, serde::Serialize)] +#[serde(rename_all = "camelCase")] pub struct GetAllDocumentsOptions { pub limit: Option, pub offset: Option, - pub attributes_to_retrieve: Option>, + pub fields: Option>, + pub retrieve_vectors: bool, } diff --git a/meilisearch/tests/common/mod.rs b/meilisearch/tests/common/mod.rs index 3117dd185..317e5e171 100644 --- a/meilisearch/tests/common/mod.rs +++ b/meilisearch/tests/common/mod.rs @@ -6,7 +6,7 @@ pub mod service; use std::fmt::{self, Display}; #[allow(unused)] -pub use index::{GetAllDocumentsOptions, GetDocumentOptions}; +pub use index::GetAllDocumentsOptions; use meili_snap::json_string; use serde::{Deserialize, Serialize}; #[allow(unused)] diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index cd2d89813..cd1be4dc4 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -795,3 +795,27 @@ async fn fetch_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await; + 
snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); +} diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs index 3b0629fcb..1ade00b06 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -4,7 +4,7 @@ use meili_snap::*; use urlencoding::encode as urlencode; use crate::common::encoder::Encoder; -use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value}; +use crate::common::{GetAllDocumentsOptions, Server, Value}; use crate::json; // TODO: partial test since we are testing error, amd error is not yet fully implemented in @@ -59,8 +59,7 @@ async fn get_document() { }) ); - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -69,9 +68,8 @@ async fn get_document() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) })) - .await; + let (response, code) = + index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await; assert_eq!(code, 200); 
assert_eq!( response, @@ -211,7 +209,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name"]), + fields: Some(vec!["name"]), ..Default::default() }) .await; @@ -225,9 +223,21 @@ async fn test_get_all_documents_attributes_to_retrieve() { assert_eq!(response["limit"], json!(20)); assert_eq!(response["total"], json!(77)); + let (response, code) = index + .get_all_documents(GetAllDocumentsOptions { fields: Some(vec![]), ..Default::default() }) + .await; + assert_eq!(code, 200); + assert_eq!(response["results"].as_array().unwrap().len(), 20); + for results in response["results"].as_array().unwrap() { + assert_eq!(results.as_object().unwrap().keys().count(), 0); + } + assert_eq!(response["offset"], json!(0)); + assert_eq!(response["limit"], json!(20)); + assert_eq!(response["total"], json!(77)); + let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec![]), + fields: Some(vec!["wrong"]), ..Default::default() }) .await; @@ -242,22 +252,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["wrong"]), - ..Default::default() - }) - .await; - assert_eq!(code, 200); - assert_eq!(response["results"].as_array().unwrap().len(), 20); - for results in response["results"].as_array().unwrap() { - assert_eq!(results.as_object().unwrap().keys().count(), 0); - } - assert_eq!(response["offset"], json!(0)); - assert_eq!(response["limit"], json!(20)); - assert_eq!(response["total"], json!(77)); - - let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name", "tags"]), + fields: Some(vec!["name", "tags"]), ..Default::default() }) .await; @@ -270,10 +265,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { } let (response, code) = 
index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*"]), - ..Default::default() - }) + .get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]), ..Default::default() }) .await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 20); @@ -283,7 +275,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*", "wrong"]), + fields: Some(vec!["*", "wrong"]), ..Default::default() }) .await; @@ -316,12 +308,10 @@ async fn get_document_s_nested_attributes_to_retrieve() { assert_eq!(code, 202); index.wait_task(1).await; - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!(response, json!({})); - let (response, code) = - index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -333,9 +323,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -343,9 +331,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { "content.truc": "foobar", }) ); - let (response, code) = index - .get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -540,3 +526,207 @@ async fn 
get_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn get_document_with_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + // by default you shouldn't see the `_vectors` object + let (documents, _code) = index.get_all_documents(Default::default()).await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, None).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir" + } + "###); + + // if we try to retrieve the vectors with the `fields` parameter they + // still shouldn't be displayed + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["name", "_vectors"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "name": "kefir" + }, + { + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await; + 
snapshot!(json_string!(documents), @r###" + { + "name": "kefir" + } + "###); + + // If we specify the retrieve vectors boolean and nothing else we should get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + } + "###); + + // If we specify the retrieve vectors boolean and exclude vectors form the `fields` we should still get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + retrieve_vectors: true, + fields: Some(vec!["name"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + }, + { + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await; + snapshot!(json_string!(documents), @r###" + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + } + "###); +} From 2cdcb703d9995f1f5f59f9dc60c47830bea2bdb9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 6 Jun 2024 10:41:16 +0200 Subject: [PATCH 048/110] fix the 
deletion of vectors and add a test --- meilisearch/tests/integration.rs | 1 + meilisearch/tests/vector/mod.rs | 149 ++++++++++++++++++ .../extract/extract_vector_points.rs | 3 +- 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 meilisearch/tests/vector/mod.rs diff --git a/meilisearch/tests/integration.rs b/meilisearch/tests/integration.rs index bb77ecc63..78da9825a 100644 --- a/meilisearch/tests/integration.rs +++ b/meilisearch/tests/integration.rs @@ -13,6 +13,7 @@ mod snapshot; mod stats; mod swap_indexes; mod tasks; +mod vector; // Tests are isolated by features in different modules to allow better readability, test // targetability, and improved incremental compilation times. diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs new file mode 100644 index 000000000..b4350116f --- /dev/null +++ b/meilisearch/tests/vector/mod.rs @@ -0,0 +1,149 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; + +#[actix_rt::test] +async fn add_remove_user_provided() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { 
retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "userProvided": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 10.0, + 10.0, + 10.0 + ] + ], + "userProvided": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (value, code) = index.delete_document(0).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); +} diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 88c42864e..964cb35e8 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ 
b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -63,7 +63,8 @@ impl VectorStateDelta { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), - VectorStateDelta::ManualDelta(add) => (false, Default::default(), add), + // We always delete the previous vectors + VectorStateDelta::ManualDelta(add) => (true, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } From 63dded3961863a319c2c14e40f6506241574ca9d Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 6 Jun 2024 11:29:16 +0200 Subject: [PATCH 049/110] implements the new analytics for the get documents routes --- meilisearch/src/analytics/mod.rs | 4 ++-- meilisearch/src/analytics/segment_analytics.rs | 17 +++++++++++++---- meilisearch/src/routes/indexes/documents.rs | 7 ++++++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 3468ad2c7..6863dc57b 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -74,8 +74,8 @@ pub enum DocumentDeletionKind { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { - PerDocumentId, - Normal { with_filter: bool, limit: usize, offset: usize }, + PerDocumentId { retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } pub trait Analytics: Sync + Send { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 6e91b99b0..56a781c47 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1542,6 +1542,9 @@ pub struct DocumentsFetchAggregator { // if a filter was used per_filter: bool, + #[serde(rename = "vector.retrieve_vectors")] + retrieve_vectors: 
bool, + // pagination #[serde(rename = "pagination.max_limit")] max_limit: usize, @@ -1551,18 +1554,21 @@ pub struct DocumentsFetchAggregator { impl DocumentsFetchAggregator { pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { - let (limit, offset) = match query { - DocumentFetchKind::PerDocumentId => (1, 0), - DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset), + let (limit, offset, retrieve_vectors) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { + (*limit, *offset, *retrieve_vectors) + } }; Self { timestamp: Some(OffsetDateTime::now_utc()), user_agents: extract_user_agents(request).into_iter().collect(), total_received: 1, - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId), + per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. 
} if *with_filter), max_limit: limit, max_offset: offset, + retrieve_vectors, } } @@ -1576,6 +1582,7 @@ impl DocumentsFetchAggregator { per_filter, max_limit, max_offset, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1591,6 +1598,8 @@ impl DocumentsFetchAggregator { self.max_limit = self.max_limit.max(max_limit); self.max_offset = self.max_offset.max(max_offset); + + self.retrieve_vectors |= retrieve_vectors; } pub fn into_event(self, user: &User, event_name: &str) -> Option { diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 97ded8069..81e297d54 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -110,7 +110,10 @@ pub async fn get_document( debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req); + analytics.get_fetch_documents( + &DocumentFetchKind::PerDocumentId { retrieve_vectors: params.retrieve_vectors.0 }, + &req, + ); let GetDocument { fields, retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); @@ -193,6 +196,7 @@ pub async fn documents_by_query_post( with_filter: body.filter.is_some(), limit: body.limit, offset: body.offset, + retrieve_vectors: body.retrieve_vectors, }, &req, ); @@ -232,6 +236,7 @@ pub async fn get_documents( with_filter: query.filter.is_some(), limit: query.limit, offset: query.offset, + retrieve_vectors: query.retrieve_vectors, }, &req, ); From 531e3d7d6ad7b09d81561502a0234afd864cdfe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 4 Jun 2024 15:01:26 -0400 Subject: [PATCH 050/110] MultiOps trait for OR operations --- milli/src/search/facet/filter.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 
dbd9538a5..adeec45da 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -4,7 +4,7 @@ use std::ops::Bound::{self, Excluded, Included}; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token}; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; use super::facet_range_search; @@ -382,14 +382,10 @@ impl<'a> Filter<'a> { }))? } } - FilterCondition::Or(subfilters) => { - let mut bitmap = RoaringBitmap::new(); - for f in subfilters { - bitmap |= - Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; - } - Ok(bitmap) - } + FilterCondition::Or(subfilters) => subfilters + .iter() + .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); if let Some(first_subfilter) = subfilters_iter.next() { From ff2e498267c367a17e9a74da5c3e309d61db1bf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:23:26 -0400 Subject: [PATCH 051/110] Patch roaring to use the version supporting intersection on deserialization --- Cargo.lock | 12 ++---------- Cargo.toml | 3 +++ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b62a61f92..e72d72251 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4377,12 +4377,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "retain_mut" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" - [[package]] name = "ring" version = "0.17.8" @@ -4400,13 +4394,11 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" +version = "0.10.4" +source = 
"git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#4466ae0104ed44a8cf41d187d9359483fe190701" dependencies = [ "bytemuck", "byteorder", - "retain_mut", "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 5c6c8b376..f49d1fd44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,3 +64,6 @@ opt-level = 3 opt-level = 3 [profile.bench.package.yada] opt-level = 3 + +[patch.crates-io] +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "intersection-with-serialized" } From e4a69c5ac320a9f6c9f160c84bcf1346353c1aea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:06:25 -0400 Subject: [PATCH 052/110] Introduce the FacetGroupLazyValue type --- milli/src/heed_codec/facet/mod.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 7bb874060..a8bb5055e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -47,6 +47,12 @@ pub struct FacetGroupValue { pub bitmap: RoaringBitmap, } +#[derive(Debug)] +pub struct FacetGroupLazyValue<'b> { + pub size: u8, + pub bitmap_bytes: &'b [u8], +} + pub struct FacetGroupKeyCodec { _phantom: PhantomData, } @@ -69,6 +75,7 @@ where Ok(Cow::Owned(v)) } } + impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec where T: BytesDecode<'a>, @@ -84,6 +91,7 @@ where } pub struct FacetGroupValueCodec; + impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { type EItem = FacetGroupValue; @@ -93,11 +101,23 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { Ok(Cow::Owned(v)) } } + impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Result { let size = bytes[0]; let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?; Ok(FacetGroupValue { size, bitmap }) } } + +pub struct FacetGroupLazyValueCodec; + +impl<'a> heed::BytesDecode<'a> for 
FacetGroupLazyValueCodec { + type DItem = FacetGroupLazyValue<'a>; + + fn bytes_decode(bytes: &'a [u8]) -> Result { + Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] }) + } +} From 4ca4a3f954b2a2df17a6aaf0ccf77fafccd65d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:06:57 -0400 Subject: [PATCH 053/110] Make the CboRoaringBitmapCodec support intersection on deserialization --- .../cbo_roaring_bitmap_codec.rs | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 1db518c7d..a04698019 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::io; +use std::io::{self, Cursor}; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; @@ -57,6 +57,24 @@ impl CboRoaringBitmapCodec { } } + pub fn intersection_with_serialized( + mut bytes: &[u8], + other: &RoaringBitmap, + ) -> io::Result { + // See above `deserialize_from` method for implementation details. + if bytes.len() <= THRESHOLD * size_of::() { + let mut bitmap = RoaringBitmap::new(); + while let Ok(integer) = bytes.read_u32::() { + if other.contains(integer) { + bitmap.insert(integer); + } + } + Ok(bitmap) + } else { + other.intersection_with_serialized_unchecked(Cursor::new(bytes)) + } + } + /// Merge serialized CboRoaringBitmaps in a buffer. 
/// /// if the merged values length is under the threshold, values are directly From a6f3a01c6a3a4d836f31544cc668c363e954e983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:10:00 -0400 Subject: [PATCH 054/110] Expose the universe to do efficient intersections on deserialization --- .../search/facet/facet_distribution_iter.rs | 4 +- milli/src/search/facet/facet_range_search.rs | 59 +++++++++++++++---- .../src/search/facet/facet_sort_ascending.rs | 2 +- .../src/search/facet/facet_sort_descending.rs | 4 +- milli/src/search/facet/filter.rs | 39 +++++++++--- milli/src/search/facet/mod.rs | 14 ++--- 6 files changed, 92 insertions(+), 30 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index d993ef2dc..a8aa1a006 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -38,7 +38,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; Ok(()) } else { @@ -81,7 +81,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ // We first fill the heap with values from the highest level let starting_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index e340fbac5..0f8f58771 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,9 +4,11 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec, +}; use crate::heed_codec::BytesRefCodec; -use crate::Result; +use crate::{CboRoaringBitmapCodec, Result}; /// Find all the document ids for which the given field contains a value contained within /// the two bounds. @@ -16,6 +18,7 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, + universe: Option<&RoaringBitmap>, docids: &mut RoaringBitmap, ) -> Result<()> where @@ -46,13 +49,15 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let db = db.remap_types::, FacetGroupLazyValueCodec>(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? 
+ { let rightmost_bound = - Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -64,12 +69,16 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupLazyValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, + /// The subset of documents ids that are useful for this search. + /// Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } + impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = @@ -104,7 +113,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?, + }; } } Ok(()) @@ -195,7 +210,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => 
CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; previous_key = next_key; previous_value = next_value; continue; @@ -291,7 +312,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; } else { let level = level - 1; let starting_left_bound = previous_key.left_bound; @@ -365,6 +392,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -384,6 +412,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -418,6 +447,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -439,6 +469,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -474,6 +505,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -499,6 +531,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -537,6 +570,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -556,6 +590,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -571,6 +606,7 @@ mod tests { 0, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -586,6 +622,7 @@ mod tests { 1, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -621,6 +658,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -634,6 +672,7 @@ mod tests { 1, &start, &end, + None, &mut docids, ) .unwrap(); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 07fe64510..59a95e5bd 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -36,7 +36,7 @@ pub fn 
ascending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index dd2692012..29586e4e4 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(itertools::Either::Left(DescendingFacetSort { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index adeec45da..f5fd0f2fd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -224,14 +224,14 @@ impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let filterable_fields = index.filterable_fields(rtxn)?; - - self.inner_evaluate(rtxn, index, &filterable_fields) + self.inner_evaluate(rtxn, index, &filterable_fields, None) } fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, field_id: FieldId, + universe: Option<&RoaringBitmap>, operator: &Condition<'a>, ) -> Result { let numbers_db = index.facet_id_f64_docids; @@ -291,14 +291,22 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; + let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } }; let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + left, + right, + universe, + &mut output, + )?; Ok(output) } @@ -310,6 +318,7 @@ impl<'a> Filter<'a> { field_id: FieldId, left: Bound, right: Bound, + universe: Option<&RoaringBitmap>, output: &mut RoaringBitmap, ) -> 
Result<()> { match (left, right) { @@ -321,7 +330,7 @@ impl<'a> Filter<'a> { (_, _) => (), } facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, db, field_id, &left, &right, output, + rtxn, db, field_id, &left, &right, universe, output, )?; Ok(()) @@ -332,15 +341,18 @@ impl<'a> Filter<'a> { rtxn: &heed::RoTxn, index: &Index, filterable_fields: &HashSet, + universe: Option<&RoaringBitmap>, ) -> Result { match &self.condition { FilterCondition::Not(f) => { + // TODO improve the documents_ids to also support intersections at deserialize time. let all_ids = index.documents_ids(rtxn)?; let selected = Self::inner_evaluate( &(f.as_ref().clone()).into(), rtxn, index, filterable_fields, + universe, )?; Ok(all_ids - selected) } @@ -353,7 +365,8 @@ impl<'a> Filter<'a> { for el in els { let op = Condition::Equal(el.clone()); - let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; + let el_bitmap = + Self::evaluate_operator(rtxn, index, fid, universe, &op)?; bitmap |= el_bitmap; } Ok(bitmap) @@ -371,7 +384,7 @@ impl<'a> Filter<'a> { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, op) + Self::evaluate_operator(rtxn, index, fid, universe, op) } else { Ok(RoaringBitmap::new()) } @@ -384,7 +397,8 @@ impl<'a> Filter<'a> { } FilterCondition::Or(subfilters) => subfilters .iter() - .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .cloned() + .map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); @@ -394,16 +408,21 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; for f in subfilters_iter { if bitmap.is_empty() { return Ok(bitmap); } + // TODO We are doing the intersections two times, + // it could be more efficient + // Can't I just 
replace this `&=` by an `=`? bitmap &= Self::inner_evaluate( &(f.clone()).into(), rtxn, index, filterable_fields, + Some(&bitmap), )?; } Ok(bitmap) @@ -503,6 +522,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let geo_lng_token = Token::new( @@ -535,6 +555,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let condition_right = FilterCondition::Condition { @@ -548,6 +569,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; left | right @@ -563,6 +585,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )? }; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 34a9cdcb8..858028bb5 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; pub use self::search::{FacetValueHit, SearchForFacetValues}; -use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::heed_codec::BytesRefCodec; use crate::{Index, Result}; @@ -54,9 +54,9 @@ pub fn facet_max_value<'t>( } /// Get the first facet value in the facet database -pub(crate) fn get_first_facet_value<'t, BoundCodec>( +pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -78,9 +78,9 @@ where } /// Get the last facet value in the facet database -pub(crate) fn get_last_facet_value<'t, BoundCodec>( +pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -102,9 +102,9 @@ where } /// Get the height of the highest level in the facet database 
-pub(crate) fn get_highest_level<'t>( +pub(crate) fn get_highest_level<'t, DC>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); From 7967e93c160e88dab7c2cde2f9c3cfe4352e28c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 16:58:32 -0400 Subject: [PATCH 055/110] Skip evaluating when a universe is empty, nothing can be found --- milli/src/search/facet/filter.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index f5fd0f2fd..d75ed5f22 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -343,6 +343,10 @@ impl<'a> Filter<'a> { filterable_fields: &HashSet, universe: Option<&RoaringBitmap>, ) -> Result { + if universe.map_or(false, |u| u.is_empty()) { + return Ok(RoaringBitmap::new()); + } + match &self.condition { FilterCondition::Not(f) => { // TODO improve the documents_ids to also support intersections at deserialize time. From 0a9bd398c7f52bce0b8dde30b9a8ddd7411dbb20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 16:59:00 -0400 Subject: [PATCH 056/110] Improve the NOT operator to use the universe when possible --- milli/src/search/facet/filter.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d75ed5f22..4570d4ca4 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -349,8 +349,6 @@ impl<'a> Filter<'a> { match &self.condition { FilterCondition::Not(f) => { - // TODO improve the documents_ids to also support intersections at deserialize time. 
- let all_ids = index.documents_ids(rtxn)?; let selected = Self::inner_evaluate( &(f.as_ref().clone()).into(), rtxn, @@ -358,7 +356,13 @@ impl<'a> Filter<'a> { filterable_fields, universe, )?; - Ok(all_ids - selected) + match universe { + Some(universe) => Ok(universe - selected), + None => { + let all_ids = index.documents_ids(rtxn)?; + Ok(all_ids - selected) + } + } } FilterCondition::In { fid, els } => { if crate::is_faceted(fid.value(), filterable_fields) { From 66470b27e678345b409702267945c9de7893383e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 17:03:57 -0400 Subject: [PATCH 057/110] Use the MultiOps trait for IN operations --- milli/src/search/facet/filter.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 4570d4ca4..c08abc8e0 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -367,17 +367,11 @@ impl<'a> Filter<'a> { FilterCondition::In { fid, els } => { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - let mut bitmap = RoaringBitmap::new(); - - for el in els { - let op = Condition::Equal(el.clone()); - let el_bitmap = - Self::evaluate_operator(rtxn, index, fid, universe, &op)?; - bitmap |= el_bitmap; - } - Ok(bitmap) + els.iter() + .map(|el| Condition::Equal(el.clone())) + .map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) + .union() } else { Ok(RoaringBitmap::new()) } From 5432776132cf5b23032e265bc76b2bf2a04c0c2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 21:30:04 -0400 Subject: [PATCH 058/110] Reduce the universe while exploring the facet tree --- milli/src/search/facet/facet_range_search.rs | 59 +++++++++++++------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git 
a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 0f8f58771..81fa0ef42 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -50,7 +50,7 @@ where Bound::Unbounded => Bound::Unbounded, }; let db = db.remap_types::, FacetGroupLazyValueCodec>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(starting_left_bound) = @@ -59,7 +59,7 @@ where let rightmost_bound = Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; - f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size, universe)?; Ok(()) } else { Ok(()) @@ -73,14 +73,18 @@ struct FacetRangeSearch<'t, 'b, 'bitmap> { field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, - /// The subset of documents ids that are useful for this search. - /// Great performance optimizations can be achieved by only fetching values matching this subset. - universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { - fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { + fn run_level_0( + &mut self, + starting_left_bound: &'t [u8], + group_size: usize, + // The subset of documents ids that are useful for this search. + // Great performance optimizations can be achieved by only fetching values matching this subset. 
+ universe: Option<&RoaringBitmap>, + ) -> Result<()> { let left_key = FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); @@ -113,7 +117,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= match self.universe { + *self.docids |= match universe { Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( value.bitmap_bytes, universe, @@ -150,9 +154,10 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { starting_left_bound: &'t [u8], rightmost_bound: Bound<&'t [u8]>, group_size: usize, + universe: Option<&RoaringBitmap>, ) -> Result<()> { if level == 0 { - return self.run_level_0(starting_left_bound, group_size); + return self.run_level_0(starting_left_bound, group_size, universe); } let left_key = @@ -209,12 +214,16 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; + let subset = match universe { + Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?), + None => None, + }; if should_take_whole_group { - *self.docids |= match self.universe { - Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?, + *self.docids |= match subset { + Some(subset) => subset, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; previous_key = next_key; @@ -229,7 +238,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let rightmost_bound = Bound::Excluded(next_key.left_bound); let group_size = previous_value.size as usize; - self.run(level, starting_left_bound, rightmost_bound, group_size)?; + if subset.as_ref().map_or(true, |u| !u.is_empty()) { + self.run(level, starting_left_bound, rightmost_bound, group_size, subset.as_ref())?; + 
} previous_key = next_key; previous_value = next_value; @@ -311,12 +322,18 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; + + let subset = match universe { + Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?), + None => None, + }; + if should_take_whole_group { - *self.docids |= match self.universe { - Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?, + *self.docids |= match subset { + Some(subset) => subset, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; } else { @@ -324,7 +341,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let starting_left_bound = previous_key.left_bound; let group_size = previous_value.size as usize; - self.run(level, starting_left_bound, rightmost_bound, group_size)?; + if subset.as_ref().map_or(true, |u| !u.is_empty()) { + self.run(level, starting_left_bound, rightmost_bound, group_size, subset.as_ref())?; + } } Ok(()) From 52d0d35b39096eb49e67a9801b1edc44993fa2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 21:58:40 -0400 Subject: [PATCH 059/110] Revert "Reduce the universe while exploring the facet tree" because it's slower this way This reverts commit 14026115f21409535772ede0ee4273f37848dd61. 
--- milli/src/search/facet/facet_range_search.rs | 59 +++++++------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 81fa0ef42..0f8f58771 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -50,7 +50,7 @@ where Bound::Unbounded => Bound::Unbounded, }; let db = db.remap_types::, FacetGroupLazyValueCodec>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(starting_left_bound) = @@ -59,7 +59,7 @@ where let rightmost_bound = Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; - f.run(highest_level, starting_left_bound, rightmost_bound, group_size, universe)?; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) } else { Ok(()) @@ -73,18 +73,14 @@ struct FacetRangeSearch<'t, 'b, 'bitmap> { field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, + /// The subset of documents ids that are useful for this search. + /// Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { - fn run_level_0( - &mut self, - starting_left_bound: &'t [u8], - group_size: usize, - // The subset of documents ids that are useful for this search. - // Great performance optimizations can be achieved by only fetching values matching this subset. 
- universe: Option<&RoaringBitmap>, - ) -> Result<()> { + fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); @@ -117,7 +113,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= match universe { + *self.docids |= match self.universe { Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( value.bitmap_bytes, universe, @@ -154,10 +150,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { starting_left_bound: &'t [u8], rightmost_bound: Bound<&'t [u8]>, group_size: usize, - universe: Option<&RoaringBitmap>, ) -> Result<()> { if level == 0 { - return self.run_level_0(starting_left_bound, group_size, universe); + return self.run_level_0(starting_left_bound, group_size); } let left_key = @@ -214,16 +209,12 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; - let subset = match universe { - Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?), - None => None, - }; if should_take_whole_group { - *self.docids |= match subset { - Some(subset) => subset, + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; previous_key = next_key; @@ -238,9 +229,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let rightmost_bound = Bound::Excluded(next_key.left_bound); let group_size = previous_value.size as usize; - if subset.as_ref().map_or(true, |u| !u.is_empty()) { - self.run(level, starting_left_bound, rightmost_bound, 
group_size, subset.as_ref())?; - } + self.run(level, starting_left_bound, rightmost_bound, group_size)?; previous_key = next_key; previous_value = next_value; @@ -322,18 +311,12 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; - - let subset = match universe { - Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?), - None => None, - }; - if should_take_whole_group { - *self.docids |= match subset { - Some(subset) => subset, + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; } else { @@ -341,9 +324,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let starting_left_bound = previous_key.left_bound; let group_size = previous_value.size as usize; - if subset.as_ref().map_or(true, |u| !u.is_empty()) { - self.run(level, starting_left_bound, rightmost_bound, group_size, subset.as_ref())?; - } + self.run(level, starting_left_bound, rightmost_bound, group_size)?; } Ok(()) From 734d1c53ad2488aba411554cabe5232f8cdb1d5a Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 6 Jun 2024 16:31:07 +0200 Subject: [PATCH 060/110] fix a panic in yaup --- meilisearch/tests/common/index.rs | 2 +- meilisearch/tests/documents/get_documents.rs | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index f81fe8c8a..114ede9b8 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -429,6 +429,6 @@ impl Index<'_> { pub struct GetAllDocumentsOptions { pub limit: Option, pub offset: Option, - pub fields: Option>, pub retrieve_vectors: bool, + pub fields: Option>, } diff --git a/meilisearch/tests/documents/get_documents.rs 
b/meilisearch/tests/documents/get_documents.rs index 1ade00b06..3bf3727c4 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -223,9 +223,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { assert_eq!(response["limit"], json!(20)); assert_eq!(response["total"], json!(77)); - let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { fields: Some(vec![]), ..Default::default() }) - .await; + let (response, code) = index.get_all_documents_raw("?fields=").await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 20); for results in response["results"].as_array().unwrap() { From 40f05fe15693cb8cfea35af7de887447406b25cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 10:59:55 -0400 Subject: [PATCH 061/110] Bump roaring to the latest commit --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index e72d72251..88af93bd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4395,7 +4395,7 @@ dependencies = [ [[package]] name = "roaring" version = "0.10.4" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#4466ae0104ed44a8cf41d187d9359483fe190701" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#88b848b84cf7c8cc8d2ea02dfff77b5a54d822ec" dependencies = [ "bytemuck", "byteorder", From 75b2e02cd2e3b46d0f0494c6da822940c9fe98cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 11:00:07 -0400 Subject: [PATCH 062/110] Log more stuff around filtering --- milli/src/search/new/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 623c72567..52eb7ffea 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -548,6 +548,7 @@ fn resolve_sort_criteria<'ctx, Query: 
RankingRuleQueryTrait>( Ok(()) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] pub fn filtered_universe( index: &Index, txn: &RoTxn<'_>, From 57d066595b47d47669b626c5040a77c7b458b1f4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Jun 2024 17:24:50 +0200 Subject: [PATCH 063/110] fix Tests almost all features --- meilisearch/tests/search/mod.rs | 2 +- milli/src/search/new/tests/stop_words.rs | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b65c0dc42..e80c5144d 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -301,7 +301,7 @@ async fn negative_special_cases_search() { index.add_documents(documents, None).await; index.wait_task(0).await; - index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await; + index.update_settings(json!({"synonyms": { "escape": ["gläss"] }})).await; index.wait_task(1).await; // There is a synonym for escape -> glass but we don't want "escape", only the derivates: glass diff --git a/milli/src/search/new/tests/stop_words.rs b/milli/src/search/new/tests/stop_words.rs index 629751b48..dc1e45fce 100644 --- a/milli/src/search/new/tests/stop_words.rs +++ b/milli/src/search/new/tests/stop_words.rs @@ -13,7 +13,7 @@ use std::collections::BTreeSet; use std::iter::FromIterator; use crate::index::tests::TempIndex; -use crate::{db_snap, Search, SearchResult, TermsMatchingStrategy}; +use crate::{Search, SearchResult, TermsMatchingStrategy}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -66,9 +66,10 @@ fn create_index() -> TempIndex { } #[test] +#[cfg(not(feature = "swedish-recomposition"))] fn test_stop_words_not_indexed() { let index = create_index(); - db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264"); + crate::db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264"); } #[test] From 8ec6e175e52c1050566ea4460793f073508b83f0 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 7 Jun 2024 22:11:26 -0400 Subject: [PATCH 064/110] Replace roaring patch to the v0.10.5 --- Cargo.lock | 5 +++-- Cargo.toml | 3 --- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 88af93bd4..4417af63a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4394,8 +4394,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.4" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#88b848b84cf7c8cc8d2ea02dfff77b5a54d822ec" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7699249cc2c7d71939f30868f47e9d7add0bdc030d90ee10bfd16887ff8bb1c8" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index f49d1fd44..5c6c8b376 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,6 +64,3 @@ opt-level = 3 opt-level = 3 [profile.bench.package.yada] opt-level = 3 - -[patch.crates-io] -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "intersection-with-serialized" } From 19585f1a4ffccbe6df51b94c70b818427c22eae4 Mon Sep 17 00:00:00 2001 From: dureuill Date: Mon, 10 Jun 2024 07:59:36 +0000 Subject: [PATCH 065/110] Update version for the next release (v1.8.2) in Cargo.toml --- Cargo.lock | 34 +++++++++++++++++----------------- Cargo.toml | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab060ac93..b7419052a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -494,7 +494,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "benchmarks" -version = "1.8.1" +version = "1.8.2" dependencies = [ "anyhow", "bytes", @@ -639,7 +639,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.8.1" +version = "1.8.2" dependencies = [ "anyhow", "time", @@ -1539,7 +1539,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.8.1" +version = "1.8.2" 
dependencies = [ "anyhow", "big_s", @@ -1787,7 +1787,7 @@ dependencies = [ [[package]] name = "file-store" -version = "1.8.1" +version = "1.8.2" dependencies = [ "faux", "tempfile", @@ -1810,7 +1810,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.8.1" +version = "1.8.2" dependencies = [ "insta", "nom", @@ -1830,7 +1830,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.8.1" +version = "1.8.2" dependencies = [ "criterion", "serde_json", @@ -1948,7 +1948,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.8.1" +version = "1.8.2" dependencies = [ "arbitrary", "clap", @@ -2442,7 +2442,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d" [[package]] name = "index-scheduler" -version = "1.8.1" +version = "1.8.2" dependencies = [ "anyhow", "big_s", @@ -2638,7 +2638,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.8.1" +version = "1.8.2" dependencies = [ "criterion", "serde_json", @@ -3275,7 +3275,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.8.1" +version = "1.8.2" dependencies = [ "insta", "md5", @@ -3284,7 +3284,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.8.1" +version = "1.8.2" dependencies = [ "actix-cors", "actix-http", @@ -3377,7 +3377,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.8.1" +version = "1.8.2" dependencies = [ "base64 0.21.7", "enum-iterator", @@ -3396,7 +3396,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.8.1" +version = "1.8.2" dependencies = [ "actix-web", "anyhow", @@ -3426,7 +3426,7 @@ dependencies = [ [[package]] name = "meilitool" -version = "1.8.1" +version = "1.8.2" dependencies = [ "anyhow", "clap", @@ -3465,7 +3465,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.8.1" +version = "1.8.2" dependencies = [ "arroy", "big_s", @@ -3906,7 +3906,7 @@ checksum = 
"e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.8.1" +version = "1.8.2" dependencies = [ "big_s", "serde_json", @@ -6074,7 +6074,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.8.1" +version = "1.8.2" dependencies = [ "anyhow", "build-info", diff --git a/Cargo.toml b/Cargo.toml index eadef3a1b..486e24668 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ ] [workspace.package] -version = "1.8.1" +version = "1.8.2" authors = [ "Quentin de Quelen ", "Clément Renault ", From 50f8218a5d877fd0309cf6432ad5e6d204dc9a68 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Jun 2024 18:26:36 +0200 Subject: [PATCH 066/110] Asynchronously drop permits --- meilisearch/src/search_queue.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/search_queue.rs b/meilisearch/src/search_queue.rs index 6d5044d20..152b900be 100644 --- a/meilisearch/src/search_queue.rs +++ b/meilisearch/src/search_queue.rs @@ -40,8 +40,9 @@ pub struct Permit { impl Drop for Permit { fn drop(&mut self) { + let sender = self.sender.clone(); // if the channel is closed then the whole instance is down - let _ = futures::executor::block_on(self.sender.send(())); + std::mem::drop(tokio::spawn(async move { sender.send(()).await })); } } From 0502b175017119610dac4034eb3524d7b551912f Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 10 Jun 2024 10:52:49 +0200 Subject: [PATCH 067/110] log the state of the index-scheduler in all failed tests --- index-scheduler/src/lib.rs | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 50fc619d8..c5ae1c31f 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1836,6 +1836,7 @@ mod tests { assert_eq!(breakpoint, (Init, false)); let index_scheduler_handle = IndexSchedulerHandle { _tempdir: tempdir, + 
index_scheduler: index_scheduler.private_clone(), test_breakpoint_rcv: receiver, last_breakpoint: breakpoint.0, }; @@ -1924,6 +1925,7 @@ mod tests { pub struct IndexSchedulerHandle { _tempdir: TempDir, + index_scheduler: IndexScheduler, test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, last_breakpoint: Breakpoint, } @@ -1941,9 +1943,13 @@ mod tests { { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; // if we've already encountered a breakpoint we're supposed to be stuck on the false // and we expect the same variant with the true to come now. @@ -1962,9 +1968,13 @@ mod tests { { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; assert!(!b, "Found the breakpoint handle in a bad state. 
Check your test suite"); @@ -1978,9 +1988,10 @@ mod tests { fn advance_till(&mut self, breakpoints: impl IntoIterator) { for breakpoint in breakpoints { let b = self.advance(); + let state = snapshot_index_scheduler(&self.index_scheduler); assert_eq!( b, breakpoint, - "Was expecting the breakpoint `{:?}` but instead got `{:?}`.", + "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{state}", breakpoint, b ); } @@ -2013,8 +2024,8 @@ mod tests { InsideProcessBatch => (), // the batch went successfully, we can stop the loop and go on with the next states. ProcessBatchSucceeded => break, - AbortedIndexation => panic!("The batch was aborted."), - ProcessBatchFailed => panic!("The batch failed."), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), + ProcessBatchFailed => panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } @@ -2033,8 +2044,8 @@ mod tests { InsideProcessBatch => (), // the batch went failed, we can stop the loop and go on with the next states. ProcessBatchFailed => break, - ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)"), - AbortedIndexation => panic!("The batch was aborted."), + ProcessBatchSucceeded => panic!("The batch succeeded. 
(and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } From 600e97d9dcec39588bd4c305a607fc0025620e15 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 10 Jun 2024 18:26:12 +0200 Subject: [PATCH 068/110] gate the retrieveVectors parameter behind the vectors feature flag --- meilisearch/src/routes/indexes/documents.rs | 24 +++++++++--- meilisearch/src/routes/indexes/search.rs | 8 ++-- meilisearch/tests/documents/errors.rs | 43 +++++++++++++++++++++ meilisearch/tests/search/mod.rs | 41 ++++++++++++++------ 4 files changed, 96 insertions(+), 20 deletions(-) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 81e297d54..70623bb35 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -110,14 +110,18 @@ pub async fn get_document( debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - analytics.get_fetch_documents( - &DocumentFetchKind::PerDocumentId { retrieve_vectors: params.retrieve_vectors.0 }, - &req, - ); - let GetDocument { fields, retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); + let features = index_scheduler.features(); + if retrieve_vectors.0 { + features.check_vector("Passing `retrieveVectors` as a parameter")?; + } + analytics.get_fetch_documents( + &DocumentFetchKind::PerDocumentId { retrieve_vectors: retrieve_vectors.0 }, + &req, + ); + let index = index_scheduler.index(&index_uid)?; let document = retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors.0)?; @@ -191,6 +195,11 @@ pub async fn documents_by_query_post( let body = body.into_inner(); 
debug!(parameters = ?body, "Get documents POST"); + let features = index_scheduler.features(); + if body.retrieve_vectors { + features.check_vector("Passing `retrieveVectors` as a parameter")?; + } + analytics.post_fetch_documents( &DocumentFetchKind::Normal { with_filter: body.filter.is_some(), @@ -215,6 +224,11 @@ pub async fn get_documents( let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); + let features = index_scheduler.features(); + if retrieve_vectors.0 { + features.check_vector("Passing `retrieveVectors` as a parameter")?; + } + let filter = match filter { Some(f) => match serde_json::from_str(&f) { Ok(v) => Some(v), diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index ae6402cf6..6fdff4568 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -290,11 +290,13 @@ pub fn search_kind( features: RoFeatures, ) -> Result { if query.vector.is_some() { - features.check_vector("Passing `vector` as a query parameter")?; + features.check_vector("Passing `vector` as a parameter")?; } - if query.hybrid.is_some() { - features.check_vector("Passing `hybrid` as a query parameter")?; + features.check_vector("Passing `hybrid` as a parameter")?; + } + if query.retrieve_vectors { + features.check_vector("Passing `retrieveVectors` as a parameter")?; } // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index cd1be4dc4..8e9a3a696 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -800,6 +800,8 @@ async fn fetch_document_by_filter() { async fn retrieve_vectors() { let server = Server::new().await; let index = server.index("doggo"); + + // GET ALL DOCUMENTS BY QUERY let (response, _code) = 
index.get_all_documents_raw("?retrieveVectors=tamo").await; snapshot!(json_string!(response), @r###" { @@ -809,6 +811,38 @@ async fn retrieve_vectors() { "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" } "###); + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // FETCH ALL DOCUMENTS BY POST + let (response, _code) = + index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await; + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // GET A SINGLE DOCUMENT let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; snapshot!(json_string!(response), @r###" { @@ -818,4 +852,13 @@ async fn retrieve_vectors() { "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" } "###); + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 9e19fa4e8..19e495edd 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1290,21 +1290,38 @@ async fn experimental_feature_vector_store() { index.add_documents(json!(documents), None).await; index.wait_task(0).await; - let (response, code) = index - .search_post(json!({ + index + .search(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true - })) + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) + .await; + index + .search(json!({ + "retrieveVectors": true, + "showRankingScore": true + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) .await; - meili_snap::snapshot!(code, @"400 Bad Request"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); let (response, code) = server.set_features(json!({"vectorStore": true})).await; meili_snap::snapshot!(code, @"200 OK"); From 7cef2299cf0642d846246e6687193484f8f7fc03 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Jun 2024 09:42:45 +0200 Subject: [PATCH 069/110] Fix behavior when removing a document --- milli/src/update/clear_documents.rs | 7 +++++++ .../index_documents/extract/extract_vector_points.rs | 1 + 2 files changed, 8 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 3490b55e4..9eca378a5 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -64,6 +64,13 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; + // Remove all user-provided bits from the configs + let mut configs = self.index.embedding_configs(self.wtxn)?; + for config in configs.iter_mut() { + config.user_provided.clear(); + } + self.index.put_embedding_configs(self.wtxn, configs)?; + // Clear the other databases. 
external_documents_ids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?; diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 964cb35e8..48e3e697a 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -225,6 +225,7 @@ pub fn extract_vector_points( } else if document_is_kept && old.is_none() { VectorStateDelta::NoChange } else { + remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } From 3493093c4f4df2889c5fc895fd372f7e5ea2cf50 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 11 Jun 2024 16:03:45 +0200 Subject: [PATCH 070/110] add a batch of tests --- index-scheduler/src/lib.rs | 176 +++++++++++++++++++++++++-- meilisearch/tests/vector/mod.rs | 78 ++++++++++++ meilisearch/tests/vector/settings.rs | 161 ++++++++++++++++++++++++ 3 files changed, 407 insertions(+), 8 deletions(-) create mode 100644 meilisearch/tests/vector/settings.rs diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index c5ae1c31f..e2a6f03a0 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -2016,6 +2016,7 @@ mod tests { // Wait for one successful batch. #[track_caller] fn advance_one_successful_batch(&mut self) { + self.index_scheduler.assert_internally_consistent(); self.advance_till([Start, BatchCreated]); loop { match self.advance() { @@ -2025,12 +2026,16 @@ mod tests { // the batch went successfully, we can stop the loop and go on with the next states. 
ProcessBatchSucceeded => break, AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), - ProcessBatchFailed => panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)), + ProcessBatchFailed => { + while self.advance() != Start {} + panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)) + }, breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } self.advance_till([AfterProcessing]); + self.index_scheduler.assert_internally_consistent(); } // Wait for one failed batch. @@ -5012,7 +5017,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); @@ -5105,7 +5109,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); @@ -5180,7 +5183,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); @@ -5303,9 +5305,7 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); @@ -5452,9 +5452,7 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); // the document with the id 3 should have its original embedding updated let rtxn = index.read_txn().unwrap(); @@ -5481,4 +5479,166 @@ mod tests { assert!(!embedding.is_empty()); } + + #[test] + fn delete_document_containing_vector() { + // 1. Add an embedder + // 2. 
Push two documents containing a simple vector + // 3. Delete the first document + // 4. The user defined roaring bitmap shouldn't contains the id of the first document anymore + // 5. Clear the index + // 6. The user defined roaring bitmap shouldn't contains the id of the second document + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }) + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1")], + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = 
index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // TODO: Here the user provided vectors should NOT contains 1 + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0, 1]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["manual"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); + let conf = index.embedding_configs(&rtxn).unwrap(); + // TODO: Here the user provided vectors should contains nothing + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + 
config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0, 1]>, + }, + ] + "###); + } } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index b4350116f..55dc186d5 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -1,5 +1,8 @@ +mod settings; + use meili_snap::{json_string, snapshot}; +use crate::common::index::Index; use crate::common::{GetAllDocumentsOptions, Server}; use crate::json; @@ -147,3 +150,78 @@ async fn add_remove_user_provided() { } "###); } + +async fn generate_default_user_provided_documents(server: &Server) -> Index { + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + {"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }}, + {"id": 3, "name": "intel", "_vectors": { "manual": { "userProvided": true, "embeddings": [3, 3, 3] }}}, + {"id": 4, "name": "max", "_vectors": { "manual": { "userProvided": true, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + index +} + +#[actix_rt::test] +async fn 
clear_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (value, _code) = index.clear_all_documents().await; + index.wait_task(value.uid()).await; + + // Make sure the documents DB has been cleared + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "hits": [], + "query": "", + "processingTimeMs": 0, + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0, + "semanticHitCount": 0 + } + "###); +} diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs new file mode 100644 index 000000000..6b93f001e --- /dev/null +++ b/meilisearch/tests/vector/settings.rs @@ -0,0 +1,161 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; +use crate::vector::generate_default_user_provided_documents; + +#[actix_rt::test] +async fn update_embedder() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "manual": {}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + + let ret = 
server.wait_task(response.uid()).await; + snapshot!(ret, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2 + } + } + }, + "error": { + "message": "`.embedders.manual`: Field `model` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`). Available fields: `source`, `dimensions`, `distribution`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn reset_embedder_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (response, code) = index.delete_settings().await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + // Make sure the documents are still present + let (documents, _code) = index.get_all_documents(Default::default()).await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + }, + { + "id": 2, + "name": "billou" + }, + { + "id": 3, + "name": "intel" + }, + { + "id": 4, + "name": "max" + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure we are still able to retrieve their vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": {} + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + }, + { + "id": 2, + "name": "billou", + "_vectors": {} + }, + { + "id": 3, + "name": "intel", + "_vectors": {} + }, 
+ { + "id": 4, + "name": "max", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "message": "Cannot find embedder with name `default`.", + "code": "invalid_embedder", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_embedder" + } + "###); +} From 0d31be149437dfaa6133e8971c4a333cf82bd4d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 11 Jun 2024 11:39:35 -0400 Subject: [PATCH 071/110] Make the distinct work at search --- meilisearch/src/analytics/segment_analytics.rs | 14 ++++++++++++++ meilisearch/src/routes/indexes/facet_search.rs | 1 + meilisearch/src/routes/indexes/search.rs | 4 ++++ meilisearch/src/search.rs | 17 +++++++++++++++++ milli/examples/search.rs | 1 + milli/src/search/hybrid.rs | 1 + milli/src/search/mod.rs | 18 ++++++++++++++++++ milli/src/search/new/bucket_sort.rs | 8 +++++++- milli/src/search/new/matches/mod.rs | 1 + milli/src/search/new/mod.rs | 14 +++++++++++++- 10 files changed, 77 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index aed29e612..ebd808b42 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -597,6 +597,9 @@ pub struct SearchAggregator { // every time a request has a filter, this field must be incremented by one sort_total_number_of_criteria: usize, + // distinct + distinct: bool, + // filter filter_with_geo_radius: bool, filter_with_geo_bounding_box: bool, @@ -670,6 +673,7 @@ impl SearchAggregator { show_ranking_score_details, filter, sort, + distinct, facets: _, highlight_pre_tag, highlight_post_tag, @@ -692,6 +696,8 @@ impl SearchAggregator { ret.sort_sum_of_criteria_terms = sort.len(); } + 
ret.distinct = distinct.is_some(); + if let Some(ref filter) = filter { static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); ret.filter_total_number_of_criteria = 1; @@ -795,6 +801,7 @@ impl SearchAggregator { sort_with_geo_point, sort_sum_of_criteria_terms, sort_total_number_of_criteria, + distinct, filter_with_geo_radius, filter_with_geo_bounding_box, filter_sum_of_criteria_terms, @@ -851,6 +858,9 @@ impl SearchAggregator { self.sort_total_number_of_criteria = self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); + // distinct + self.distinct |= distinct; + // filter self.filter_with_geo_radius |= filter_with_geo_radius; self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; @@ -921,6 +931,7 @@ impl SearchAggregator { sort_with_geo_point, sort_sum_of_criteria_terms, sort_total_number_of_criteria, + distinct, filter_with_geo_radius, filter_with_geo_bounding_box, filter_sum_of_criteria_terms, @@ -977,6 +988,8 @@ impl SearchAggregator { "with_geoPoint": sort_with_geo_point, "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), }, + // TODO ask help from María + "distinct": distinct, "filter": { "with_geoRadius": filter_with_geo_radius, "with_geoBoundingBox": filter_with_geo_bounding_box, @@ -1087,6 +1100,7 @@ impl MultiSearchAggregator { show_matches_position: _, filter: _, sort: _, + distinct: _, facets: _, highlight_pre_tag: _, highlight_post_tag: _, diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 10b371f2d..4b3f73115 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -123,6 +123,7 @@ impl From for SearchQuery { show_ranking_score_details: false, filter, sort: None, + distinct: None, facets: None, highlight_pre_tag: DEFAULT_HIGHLIGHT_PRE_TAG(), highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(), diff --git 
a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 348d8295c..6ea6802d9 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -61,6 +61,9 @@ pub struct SearchQueryGet { filter: Option, #[deserr(default, error = DeserrQueryParamError)] sort: Option, + // TODO change the InvalidSearchSort to InvalidSearchDistinct error + #[deserr(default, error = DeserrQueryParamError)] + distinct: Option, #[deserr(default, error = DeserrQueryParamError)] show_matches_position: Param, #[deserr(default, error = DeserrQueryParamError)] @@ -158,6 +161,7 @@ impl From for SearchQuery { attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), filter, sort: other.sort.map(|attr| fix_sort_query_parameters(&attr)), + distinct: other.distinct, show_matches_position: other.show_matches_position.0, show_ranking_score: other.show_ranking_score.0, show_ranking_score_details: other.show_ranking_score_details.0, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 05b3c1aff..edc3feb5d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -75,6 +75,9 @@ pub struct SearchQuery { pub filter: Option, #[deserr(default, error = DeserrJsonError)] pub sort: Option>, + // TODO Change the error to InvalidSearchDistinct + #[deserr(default, error = DeserrJsonError)] + pub distinct: Option, #[deserr(default, error = DeserrJsonError)] pub facets: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_HIGHLIGHT_PRE_TAG())] @@ -149,6 +152,7 @@ impl fmt::Debug for SearchQuery { show_ranking_score_details, filter, sort, + distinct, facets, highlight_pre_tag, highlight_post_tag, @@ -195,6 +199,9 @@ impl fmt::Debug for SearchQuery { if let Some(sort) = sort { debug.field("sort", &sort); } + if let Some(distinct) = distinct { + debug.field("distinct", &distinct); + } if let Some(facets) = facets { debug.field("facets", &facets); } @@ -386,6 
+393,9 @@ pub struct SearchQueryWithIndex { pub filter: Option, #[deserr(default, error = DeserrJsonError)] pub sort: Option>, + // TODO change error to InvalidSearchDistinct + #[deserr(default, error = DeserrJsonError)] + pub distinct: Option, #[deserr(default, error = DeserrJsonError)] pub facets: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_HIGHLIGHT_PRE_TAG())] @@ -421,6 +431,7 @@ impl SearchQueryWithIndex { show_matches_position, filter, sort, + distinct, facets, highlight_pre_tag, highlight_post_tag, @@ -448,6 +459,7 @@ impl SearchQueryWithIndex { show_matches_position, filter, sort, + distinct, facets, highlight_pre_tag, highlight_post_tag, @@ -716,6 +728,10 @@ fn prepare_search<'t>( search.ranking_score_threshold(ranking_score_threshold.0); } + if let Some(distinct) = &query.distinct { + search.distinct(distinct.clone()); + } + match search_kind { SearchKind::KeywordOnly => { if let Some(q) = &query.q { @@ -866,6 +882,7 @@ pub fn perform_search( matching_strategy: _, attributes_to_search_on: _, filter: _, + distinct: _, } = query; let format = AttributesFormat { diff --git a/milli/examples/search.rs b/milli/examples/search.rs index 0195c396f..87020994a 100644 --- a/milli/examples/search.rs +++ b/milli/examples/search.rs @@ -59,6 +59,7 @@ fn main() -> Result<(), Box> { false, universe, &None, + &None, GeoSortStrategy::default(), 0, 20, diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index 87f922c4c..1c784097d 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -159,6 +159,7 @@ impl<'a> Search<'a> { offset: 0, limit: self.limit + self.offset, sort_criteria: self.sort_criteria.clone(), + distinct: self.distinct.clone(), searchable_attributes: self.searchable_attributes, geo_strategy: self.geo_strategy, terms_matching_strategy: self.terms_matching_strategy, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 49d73ff31..d937875da 100644 --- a/milli/src/search/mod.rs +++ 
b/milli/src/search/mod.rs @@ -40,6 +40,7 @@ pub struct Search<'a> { offset: usize, limit: usize, sort_criteria: Option>, + distinct: Option, searchable_attributes: Option<&'a [String]>, geo_strategy: new::GeoSortStrategy, terms_matching_strategy: TermsMatchingStrategy, @@ -61,6 +62,7 @@ impl<'a> Search<'a> { offset: 0, limit: 20, sort_criteria: None, + distinct: None, searchable_attributes: None, geo_strategy: new::GeoSortStrategy::default(), terms_matching_strategy: TermsMatchingStrategy::default(), @@ -105,6 +107,11 @@ impl<'a> Search<'a> { self } + pub fn distinct(&mut self, distinct: String) -> &mut Search<'a> { + self.distinct = Some(distinct); + self + } + pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> { self.searchable_attributes = Some(searchable); self @@ -169,6 +176,13 @@ impl<'a> Search<'a> { ctx.attributes_to_search_on(searchable_attributes)?; } + if let Some(distinct) = &self.distinct { + if !ctx.index.filterable_fields(ctx.txn)?.contains(distinct) { + // TODO return a real error message + panic!("Distinct search field is not a filterable attribute"); + } + } + let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?; let PartialSearchResult { located_query_terms, @@ -185,6 +199,7 @@ impl<'a> Search<'a> { self.scoring_strategy, universe, &self.sort_criteria, + &self.distinct, self.geo_strategy, self.offset, self.limit, @@ -202,6 +217,7 @@ impl<'a> Search<'a> { self.exhaustive_number_hits, universe, &self.sort_criteria, + &self.distinct, self.geo_strategy, self.offset, self.limit, @@ -238,6 +254,7 @@ impl fmt::Debug for Search<'_> { offset, limit, sort_criteria, + distinct, searchable_attributes, geo_strategy: _, terms_matching_strategy, @@ -257,6 +274,7 @@ impl fmt::Debug for Search<'_> { .field("offset", offset) .field("limit", limit) .field("sort_criteria", sort_criteria) + .field("distinct", distinct) .field("searchable_attributes", searchable_attributes) .field("terms_matching_strategy", 
terms_matching_strategy) .field("scoring_strategy", scoring_strategy) diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index d937c78bf..9255e4c09 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -22,6 +22,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, query: &Q, + distinct: Option<&str>, universe: &RoaringBitmap, from: usize, length: usize, @@ -34,7 +35,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( logger.ranking_rules(&ranking_rules); logger.initial_universe(universe); - let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { + let distinct_field = match distinct { + Some(distinct) => Some(distinct), + None => ctx.index.distinct_field(ctx.txn)?, + }; + + let distinct_fid = if let Some(field) = distinct_field { ctx.index.fields_ids_map(ctx.txn)?.id(field) } else { None diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 87ddb2915..77ae5fcd5 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -516,6 +516,7 @@ mod tests { false, universe, &None, + &None, crate::search::new::GeoSortStrategy::default(), 0, 100, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 623c72567..257f81539 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -567,6 +567,7 @@ pub fn execute_vector_search( scoring_strategy: ScoringStrategy, universe: RoaringBitmap, sort_criteria: &Option>, + distinct: &Option, geo_strategy: geo_sort::Strategy, from: usize, length: usize, @@ -597,6 +598,7 @@ pub fn execute_vector_search( ctx, ranking_rules, &PlaceholderQuery, + distinct.as_deref(), &universe, from, length, @@ -626,6 +628,7 @@ pub fn execute_search( exhaustive_number_hits: bool, mut universe: RoaringBitmap, sort_criteria: &Option>, + distinct: &Option, geo_strategy: geo_sort::Strategy, 
from: usize, length: usize, @@ -716,6 +719,7 @@ pub fn execute_search( ctx, ranking_rules, &graph, + distinct.as_deref(), &universe, from, length, @@ -731,6 +735,7 @@ pub fn execute_search( ctx, ranking_rules, &PlaceholderQuery, + distinct.as_deref(), &universe, from, length, @@ -747,7 +752,14 @@ pub fn execute_search( // The candidates is the universe unless the exhaustive number of hits // is requested and a distinct attribute is set. if exhaustive_number_hits { - if let Some(f) = ctx.index.distinct_field(ctx.txn)? { + // TODO Should the distinct search parameter replace the distinct setting? + // Or should we return an error if the distinct search param is set at the same time as the setting is set? + let distinct_field = match distinct.as_deref() { + Some(distinct) => Some(distinct), + None => ctx.index.distinct_field(ctx.txn)?, + }; + + if let Some(f) = distinct_field { if let Some(distinct_fid) = fields_ids_map.id(f) { all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining; } From ee39309aaeb1a595e9489351aac41223b61a1d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 11 Jun 2024 16:03:39 -0400 Subject: [PATCH 072/110] Improve errors and introduce a new InvalidSearchDistinct error code --- meilisearch-types/src/error.rs | 4 +++- meilisearch/src/routes/indexes/search.rs | 3 +-- meilisearch/src/search.rs | 6 ++---- milli/src/error.rs | 11 +++++++++++ milli/src/search/mod.rs | 16 +++++++++++----- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 150c56b9d..1d91887e7 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -270,13 +270,14 @@ InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ; InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ; InvalidSearchSort , InvalidRequest , BAD_REQUEST ; 
+InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ; InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ; InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ; InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ; InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ; -InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ; +InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ; InvalidSettingsEmbedders , InvalidRequest , BAD_REQUEST ; InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ; InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ; @@ -381,6 +382,7 @@ impl ErrorCode for milli::Error { Code::IndexPrimaryKeyMultipleCandidatesFound } UserError::PrimaryKeyCannotBeChanged(_) => Code::IndexPrimaryKeyAlreadyExists, + UserError::InvalidDistinctAttribute { .. } => Code::InvalidSearchDistinct, UserError::SortRankingRuleMissing => Code::InvalidSearchSort, UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets, UserError::InvalidSortableAttribute { .. 
} => Code::InvalidSearchSort, diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 6ea6802d9..cf179a234 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -61,8 +61,7 @@ pub struct SearchQueryGet { filter: Option, #[deserr(default, error = DeserrQueryParamError)] sort: Option, - // TODO change the InvalidSearchSort to InvalidSearchDistinct error - #[deserr(default, error = DeserrQueryParamError)] + #[deserr(default, error = DeserrQueryParamError)] distinct: Option, #[deserr(default, error = DeserrQueryParamError)] show_matches_position: Param, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index edc3feb5d..522577cde 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -75,8 +75,7 @@ pub struct SearchQuery { pub filter: Option, #[deserr(default, error = DeserrJsonError)] pub sort: Option>, - // TODO Change the error to InvalidSearchDistinct - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub distinct: Option, #[deserr(default, error = DeserrJsonError)] pub facets: Option>, @@ -393,8 +392,7 @@ pub struct SearchQueryWithIndex { pub filter: Option, #[deserr(default, error = DeserrJsonError)] pub sort: Option>, - // TODO change error to InvalidSearchDistinct - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub distinct: Option, #[deserr(default, error = DeserrJsonError)] pub facets: Option>, diff --git a/milli/src/error.rs b/milli/src/error.rs index 83754afe4..7420ce667 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -134,6 +134,17 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco } )] InvalidSortableAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, + #[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. 
{}", + .field, + match .valid_fields.is_empty() { + true => "This index does not have configured filterable attributes.".to_string(), + false => format!("Available filterable attributes are: `{}{}`.", + valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "), + .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""), + ), + } + )] + InvalidDistinctAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, #[error("Attribute `{}` is not facet-searchable. {}", .field, match .valid_fields.is_empty() { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d937875da..922b72d04 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,8 +11,8 @@ use self::new::{execute_vector_search, PartialSearchResult}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::Embedder; use crate::{ - execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, - SearchContext, TimeBudget, + execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index, + Result, SearchContext, TimeBudget, UserError, }; // Building these factories is not free. 
@@ -177,9 +177,15 @@ impl<'a> Search<'a> { } if let Some(distinct) = &self.distinct { - if !ctx.index.filterable_fields(ctx.txn)?.contains(distinct) { - // TODO return a real error message - panic!("Distinct search field is not a filterable attribute"); + let filterable_fields = ctx.index.filterable_fields(ctx.txn)?; + if !filterable_fields.contains(distinct) { + let (valid_fields, hidden_fields) = + ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?; + return Err(Error::UserError(UserError::InvalidDistinctAttribute { + field: distinct.clone(), + valid_fields, + hidden_fields, + })); } } From 1991bd03daf30d29b612ff613a463ad94b98d6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 11 Jun 2024 17:02:39 -0400 Subject: [PATCH 073/110] Distinct at search erases the distinct in the settings --- milli/src/search/new/mod.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 257f81539..5921e27eb 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -752,8 +752,6 @@ pub fn execute_search( // The candidates is the universe unless the exhaustive number of hits // is requested and a distinct attribute is set. if exhaustive_number_hits { - // TODO Should the distinct search parameter replace the distinct setting? - // Or should we return an error if the distinct search param is set at the same time as the setting is set? 
let distinct_field = match distinct.as_deref() { Some(distinct) => Some(distinct), None => ctx.index.distinct_field(ctx.txn)?, From 39f60abd7d02d5e6207fb653835f54ac0e37cdd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 11 Jun 2024 17:53:53 -0400 Subject: [PATCH 074/110] Add and modify distinct tests --- milli/src/search/new/tests/distinct.rs | 65 ++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs index c54600f27..75c00da2a 100644 --- a/milli/src/search/new/tests/distinct.rs +++ b/milli/src/search/new/tests/distinct.rs @@ -205,8 +205,18 @@ fn create_index() -> TempIndex { index } -fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec { - let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids); +fn verify_distinct( + index: &Index, + txn: &RoTxn, + distinct: Option<&str>, + docids: &[u32], +) -> Vec { + let vs = collect_field_values( + index, + txn, + distinct.or_else(|| index.distinct_field(txn).unwrap()).unwrap(), + docids, + ); let mut unique = HashSet::new(); for v in vs.iter() { @@ -223,12 +233,49 @@ fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec { fn test_distinct_placeholder_no_ranking_rules() { let index = create_index(); + // Set the letter as filterable and unset the distinct attribute. + index + .update_settings(|s| { + s.set_filterable_fields(hashset! { S("letter") }); + s.reset_distinct_field(); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.distinct(S("letter")); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, Some("letter"), &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); +} + +#[test] +fn test_distinct_at_search_placeholder_no_ranking_rules() { + let index = create_index(); + let txn = index.read_txn().unwrap(); let s = Search::new(&txn, &index); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"A\"", @@ -263,7 +310,7 @@ fn test_distinct_placeholder_sort() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"E\"", @@ -303,7 +350,7 @@ fn test_distinct_placeholder_sort() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"I\"", @@ -346,7 +393,7 @@ fn test_distinct_placeholder_sort() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"I\"", @@ -399,7 +446,7 @@ fn test_distinct_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"A\"", @@ -453,7 +500,7 @@ fn test_distinct_sort_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"I\"", @@ -549,7 +596,7 @@ fn test_distinct_typo() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]"); - let distinct_values = verify_distinct(&index, &txn, &documents_ids); + let distinct_values = verify_distinct(&index, &txn, None, &documents_ids); insta::assert_debug_snapshot!(distinct_values, @r###" [ "\"B\"", From 304a9df52da8ef0641f1b1fc8d713264725f9929 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Wed, 12 Jun 2024 07:22:24 +0200 Subject: [PATCH 075/110] Remove `-v` parameter --- .github/workflows/test-suite.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 5dbde4301..84a82250e 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -116,7 +116,7 @@ jobs: override: true - name: Run cargo tree without default features and check lindera is not present run: | - if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -vqz lindera; then + if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -qz lindera; then echo "lindera has been found in the sources and it shouldn't" exit 1 fi From b368105272a926b85e4848dc86d75c5b25edaf8d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:02:12 +0200 Subject: [PATCH 076/110] Add EmbedderConfigs::into_inner --- milli/src/vector/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 553c8c3c1..c43fa8bd2 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -152,6 +152,10 @@ impl EmbeddingConfigs { &self.0 } + pub fn into_inner(self) -> HashMap, Arc)> { + self.0 + } + /// Get the name of the default embedder configuration. 
/// /// The default embedder is determined as follows: From e9bf4eb10056ed96bfc1964717d44fd7c54e4487 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:02:49 +0200 Subject: [PATCH 077/110] Reformulate ParsedVectorsDiff in terms of VectorState --- milli/src/vector/parsed_vectors.rs | 78 ++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 501bd2ad2..9007e03e4 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -42,9 +42,31 @@ pub struct ExplicitVectors { pub user_provided: bool, } +pub enum VectorState { + Inline(Vectors), + InDb, + Generated, +} + +impl VectorState { + pub fn is_user_provided(&self) -> bool { + match self { + VectorState::Inline(vectors) => vectors.is_user_provided(), + VectorState::InDb => true, + VectorState::Generated => false, + } + } +} + +pub enum VectorsState { + NoVectorsFid, + NoVectorsFieldInDocument, + Vectors(BTreeMap), +} + pub struct ParsedVectorsDiff { - pub old: BTreeMap>, - pub new: Option>, + old: BTreeMap, + new: VectorsState, } impl ParsedVectorsDiff { @@ -71,26 +93,54 @@ impl ParsedVectorsDiff { return Err(error); } } - .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); + .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); for embedding_config in embedders_configs { if embedding_config.user_provided.contains(docid) { - old.entry(embedding_config.name.to_string()).or_insert(None); + old.entry(embedding_config.name.to_string()).or_insert(VectorState::InDb); } } - let new = new_vectors_fid - .and_then(|vectors_fid| documents_diff.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Addition)) - .transpose()? 
- .flatten(); + let new = 'new: { + let Some(new_vectors_fid) = new_vectors_fid else { + break 'new VectorsState::NoVectorsFid; + }; + let Some(bytes) = documents_diff.get(new_vectors_fid) else { + break 'new VectorsState::NoVectorsFieldInDocument; + }; + let obkv = KvReaderDelAdd::new(bytes); + match to_vector_map(obkv, DelAdd::Addition)? { + Some(new) => VectorsState::Vectors(new), + None => VectorsState::NoVectorsFieldInDocument, + } + }; + Ok(Self { old, new }) } - /// Return (Some(None), _) in case the vector is user defined and contained in the database. - pub fn remove(&mut self, embedder_name: &str) -> (Option>, Option) { - let old = self.old.remove(embedder_name); - let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); + pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) { + let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); + let state_from_old = match old { + // assume a userProvided is still userProvided + VectorState::InDb => VectorState::InDb, + // generated is still generated + VectorState::Generated => VectorState::Generated, + // weird case that shouldn't happen were the previous docs version is inline, + // but it was removed in the new version + // Since it is not in the new version, we switch to generated + VectorState::Inline(_) => VectorState::Generated, + }; + let new = match &mut self.new { + VectorsState::Vectors(new) => { + new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old) + } + _ => + // if no `_vectors` field is present in the new document, + // the state depends on the previous version of the document + { + state_from_old + } + }; + (old, new) } } From d0b05ae691681b7e490b4303bbd56daaf71a0845 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:03:40 +0200 Subject: [PATCH 078/110] Add EmbedderAction to settings --- milli/src/vector/settings.rs | 300 ++++++++++++++++++++++++++++------- 1 file changed, 240 insertions(+), 60 
deletions(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index e786a7164..edbed462c 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -1,4 +1,5 @@ use deserr::Deserr; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use super::rest::InputType; @@ -72,6 +73,245 @@ pub fn check_unset( } } +/// Indicates what action should take place during a reindexing operation for an embedder +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ReindexAction { + /// An indexing operation should take place for this embedder, keeping existing vectors + /// and checking whether the document template changed or not + RegeneratePrompts, + /// An indexing operation should take place for all documents for this embedder, removing existing vectors + /// (except userProvided ones) + FullReindex, +} + +pub enum SettingsDiff { + Remove, + Reindex { action: ReindexAction, updated_settings: EmbeddingSettings }, + UpdateWithoutReindex { updated_settings: EmbeddingSettings }, +} + +pub enum EmbedderAction { + WriteBackToDocuments(WriteBackToDocuments), + Reindex(ReindexAction), +} + +pub struct WriteBackToDocuments { + pub embedder_id: u8, + pub user_provided: RoaringBitmap, +} + +impl SettingsDiff { + pub fn should_reindex(&self) -> bool { + match self { + SettingsDiff::Remove { .. } | SettingsDiff::Reindex { .. } => true, + SettingsDiff::UpdateWithoutReindex { .. 
} => false, + } + } + + pub fn from_settings(old: EmbeddingSettings, new: Setting) -> Self { + match new { + Setting::Set(new) => { + let EmbeddingSettings { + mut source, + mut model, + mut revision, + mut api_key, + mut dimensions, + mut document_template, + mut url, + mut query, + mut input_field, + mut path_to_embeddings, + mut embedding_object, + mut input_type, + mut distribution, + } = old; + + let EmbeddingSettings { + source: new_source, + model: new_model, + revision: new_revision, + api_key: new_api_key, + dimensions: new_dimensions, + document_template: new_document_template, + url: new_url, + query: new_query, + input_field: new_input_field, + path_to_embeddings: new_path_to_embeddings, + embedding_object: new_embedding_object, + input_type: new_input_type, + distribution: new_distribution, + } = new; + + let mut reindex_action = None; + + // **Warning**: do not use short-circuiting || here, we want all these operations applied + if source.apply(new_source) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + // when the source changes, we need to reapply the default settings for the new source + apply_default_for_source( + &source, + &mut model, + &mut revision, + &mut dimensions, + &mut url, + &mut query, + &mut input_field, + &mut path_to_embeddings, + &mut embedding_object, + &mut input_type, + &mut document_template, + ) + } + if model.apply(new_model) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if revision.apply(new_revision) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if dimensions.apply(new_dimensions) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if url.apply(new_url) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if query.apply(new_query) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if 
input_field.apply(new_input_field) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if path_to_embeddings.apply(new_path_to_embeddings) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if embedding_object.apply(new_embedding_object) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if input_type.apply(new_input_type) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if document_template.apply(new_document_template) { + ReindexAction::push_action( + &mut reindex_action, + ReindexAction::RegeneratePrompts, + ); + } + + distribution.apply(new_distribution); + api_key.apply(new_api_key); + + let updated_settings = EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + url, + query, + input_field, + path_to_embeddings, + embedding_object, + input_type, + distribution, + }; + + match reindex_action { + Some(action) => Self::Reindex { action, updated_settings }, + None => Self::UpdateWithoutReindex { updated_settings }, + } + } + Setting::Reset => Self::Remove, + Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old }, + } + } +} + +impl ReindexAction { + fn push_action(this: &mut Option, other: Self) { + *this = match (*this, other) { + (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex), + (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex), + (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts), + } + } +} + +#[allow(clippy::too_many_arguments)] // private function +fn apply_default_for_source( + source: &Setting, + model: &mut Setting, + revision: &mut Setting, + dimensions: &mut Setting, + url: &mut Setting, + query: &mut Setting, + input_field: &mut Setting>, + path_to_embeddings: &mut Setting>, + embedding_object: &mut Setting>, + input_type: &mut Setting, + document_template: &mut Setting, +) { 
+ match source { + Setting::Set(EmbedderSource::HuggingFace) => { + *model = Setting::Reset; + *revision = Setting::Reset; + *dimensions = Setting::NotSet; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + } + Setting::Set(EmbedderSource::Ollama) => { + *model = Setting::Reset; + *revision = Setting::NotSet; + *dimensions = Setting::Reset; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + } + Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => { + *model = Setting::Reset; + *revision = Setting::NotSet; + *dimensions = Setting::NotSet; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + } + Setting::Set(EmbedderSource::Rest) => { + *model = Setting::NotSet; + *revision = Setting::NotSet; + *dimensions = Setting::Reset; + *url = Setting::Reset; + *query = Setting::Reset; + *input_field = Setting::Reset; + *path_to_embeddings = Setting::Reset; + *embedding_object = Setting::Reset; + *input_type = Setting::Reset; + } + Setting::Set(EmbedderSource::UserProvided) => { + *model = Setting::NotSet; + *revision = Setting::NotSet; + *dimensions = Setting::Reset; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + *document_template = Setting::NotSet; + } + Setting::NotSet => {} + } +} + pub fn check_set( key: &Setting, field: &'static str, @@ -210,66 +450,6 @@ impl EmbeddingSettings { *model = Setting::Set(openai::EmbeddingModel::default().name().to_owned()) } } - - 
pub(crate) fn apply_and_need_reindex( - old: &mut Setting, - new: Setting, - ) -> bool { - match (old, new) { - ( - Setting::Set(EmbeddingSettings { - source: old_source, - model: old_model, - revision: old_revision, - api_key: old_api_key, - dimensions: old_dimensions, - document_template: old_document_template, - url: old_url, - query: old_query, - input_field: old_input_field, - path_to_embeddings: old_path_to_embeddings, - embedding_object: old_embedding_object, - input_type: old_input_type, - distribution: old_distribution, - }), - Setting::Set(EmbeddingSettings { - source: new_source, - model: new_model, - revision: new_revision, - api_key: new_api_key, - dimensions: new_dimensions, - document_template: new_document_template, - url: new_url, - query: new_query, - input_field: new_input_field, - path_to_embeddings: new_path_to_embeddings, - embedding_object: new_embedding_object, - input_type: new_input_type, - distribution: new_distribution, - }), - ) => { - let mut needs_reindex = false; - - needs_reindex |= old_source.apply(new_source); - needs_reindex |= old_model.apply(new_model); - needs_reindex |= old_revision.apply(new_revision); - needs_reindex |= old_dimensions.apply(new_dimensions); - needs_reindex |= old_document_template.apply(new_document_template); - needs_reindex |= old_url.apply(new_url); - needs_reindex |= old_query.apply(new_query); - needs_reindex |= old_input_field.apply(new_input_field); - needs_reindex |= old_path_to_embeddings.apply(new_path_to_embeddings); - needs_reindex |= old_embedding_object.apply(new_embedding_object); - needs_reindex |= old_input_type.apply(new_input_type); - - old_distribution.apply(new_distribution); - old_api_key.apply(new_api_key); - needs_reindex - } - (Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false, - _ => true, - } - } } #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] From d18c1f77d7453b2842851a947621cbe5687fdbb5 Mon Sep 17 00:00:00 2001 From: Louis 
Dureuil Date: Wed, 12 Jun 2024 14:04:54 +0200 Subject: [PATCH 079/110] Update embedder configs with a finer granularity - no longer clear vector DB between any two embedder changes --- milli/src/update/settings.rs | 278 +++++++++++++++++++++-------------- 1 file changed, 171 insertions(+), 107 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 08b12d178..5421b64a7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -23,7 +23,10 @@ use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; -use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; +use crate::vector::settings::{ + check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction, + WriteBackToDocuments, +}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; @@ -924,111 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } - fn update_embedding_configs(&mut self) -> Result { - let update = match std::mem::take(&mut self.embedder_settings) { - Setting::Set(configs) => { - let mut changed = false; + fn update_embedding_configs(&mut self) -> Result> { + match std::mem::take(&mut self.embedder_settings) { + Setting::Set(configs) => self.update_embedding_configs_set(configs), + Setting::Reset => { + // all vectors should be written back to documents let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap, RoaringBitmap)> = - old_configs - .into_iter() - .map( - |IndexEmbeddingConfig { name, config, user_provided: user_defined }| { - (name, (Setting::Set(config.into()), user_defined)) - }, - ) - .collect(); - - let mut new_configs = BTreeMap::new(); - for joined in old_configs + let remove_all: Result> = old_configs 
.into_iter() - .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) - { - match joined { - // updated config - EitherOrBoth::Both((name, (mut old, user_provided)), (_, new)) => { - changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); - if changed { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - "need reindex" - ); - } else { - tracing::debug!(embedder = name, "skip reindex"); - } - let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, (new, user_provided)); - } - // unchanged config - EitherOrBoth::Left((name, setting)) => { - new_configs.insert(name, setting); - } - // new config - EitherOrBoth::Right((name, mut setting)) => { - // apply the default source in case the source was not set so that it gets validated - crate::vector::settings::EmbeddingSettings::apply_default_source( - &mut setting, - ); - crate::vector::settings::EmbeddingSettings::apply_default_openai_model( - &mut setting, - ); - let setting = validate_embedding_settings(setting, &name)?; - changed = true; - new_configs.insert(name, (setting, RoaringBitmap::new())); - } - } - } - let new_configs: Vec = new_configs - .into_iter() - .filter_map(|(name, (config, user_provided))| match config { - Setting::Set(config) => Some(IndexEmbeddingConfig { + .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> { + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + Ok(( name, - config: config.into(), - user_provided, - }), - Setting::Reset => None, - Setting::NotSet => Some(IndexEmbeddingConfig { - name, - config: EmbeddingSettings::default().into(), - user_provided, - }), + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + )) }) .collect(); + let remove_all = remove_all?; + 
self.index.embedder_category_id.clear(self.wtxn)?; - for (index, index_embedding_config) in new_configs.iter().enumerate() { - self.index.embedder_category_id.put_with_flags( - self.wtxn, - heed::PutFlags::APPEND, - &index_embedding_config.name, - &index - .try_into() - .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, - )?; - } - - if new_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; - } else { - self.index.put_embedding_configs(self.wtxn, new_configs)?; - } - changed - } - Setting::Reset => { self.index.delete_embedding_configs(self.wtxn)?; - true + Ok(remove_all) } - Setting::NotSet => false, - }; - - // if any changes force a reindexing - // clear the vector database. - if update { - self.index.vector_arroy.clear(self.wtxn)?; + Setting::NotSet => Ok(Default::default()), } + } - Ok(update) + fn update_embedding_configs_set( + &mut self, + configs: BTreeMap>, + ) -> Result> { + use crate::vector::settings::SettingsDiff; + + let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap = old_configs + .into_iter() + .map(|IndexEmbeddingConfig { name, config, user_provided }| { + (name, (config.into(), user_provided)) + }) + .collect(); + let mut updated_configs = BTreeMap::new(); + let mut embedder_actions = BTreeMap::new(); + for joined in old_configs + .into_iter() + .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) + { + match joined { + // updated config + EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + let settings_diff = SettingsDiff::from_settings(old, new); + match settings_diff { + SettingsDiff::Remove => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "removing embedder" + ); + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + // free id immediately 
+ self.index.embedder_category_id.delete(self.wtxn, &name)?; + embedder_actions.insert( + name, + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + ); + } + SettingsDiff::Reindex { action, updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + ?action, + "reindex embedder" + ); + embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action)); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + SettingsDiff::UpdateWithoutReindex { updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "update without reindex embedder" + ); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + } + } + // unchanged config + EitherOrBoth::Left((name, (setting, user_provided))) => { + tracing::debug!(embedder = name, "unchanged embedder"); + updated_configs.insert(name, (Setting::Set(setting), user_provided)); + } + // new config + EitherOrBoth::Right((name, mut setting)) => { + tracing::debug!(embedder = name, "new embedder"); + // apply the default source in case the source was not set so that it gets validated + crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); + crate::vector::settings::EmbeddingSettings::apply_default_openai_model( + &mut setting, + ); + let setting = validate_embedding_settings(setting, &name)?; + embedder_actions + .insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex)); + updated_configs.insert(name, (setting, RoaringBitmap::new())); + } + } + } + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + for res in self.index.embedder_category_id.iter(self.wtxn)? 
{ + let (_name, id) = res?; + free_indices[id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + for (name, action) in embedder_actions.iter() { + match action { + EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { + /* cannot be a new embedder, so has to have an id already */ + } + EmbedderAction::Reindex(ReindexAction::FullReindex) => { + if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() { + let id = find_free_index() + .ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; + tracing::debug!(embedder = name, id, "assigning free id to new embedder"); + self.index.embedder_category_id.put(self.wtxn, name, &id)?; + } + } + EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ } + } + } + let updated_configs: Vec = updated_configs + .into_iter() + .filter_map(|(name, (config, user_provided))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_provided, + }), + }) + .collect(); + if updated_configs.is_empty() { + self.index.delete_embedding_configs(self.wtxn)?; + } else { + self.index.put_embedding_configs(self.wtxn, updated_configs)?; + } + Ok(embedder_actions) } fn update_search_cutoff(&mut self) -> Result { @@ -1082,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_searchable()?; self.update_exact_attributes()?; self.update_proximity_precision()?; - // TODO: very rough approximation of the needs for reindexing where any change will result in - // a full reindexing. - // What can be done instead: - // 1. Only change the distance on a distance change - // 2. Only change the name -> embedder mapping on a name change - // 3. 
Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage - let embedding_configs_updated = self.update_embedding_configs()?; + + let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; new_inner_settings.recompute_facets(self.wtxn, self.index)?; @@ -1102,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -1119,7 +1183,7 @@ pub struct InnerIndexSettingsDiff { pub(crate) new: InnerIndexSettings, pub(crate) primary_key_id: Option, // TODO: compare directly the embedders. - pub(crate) embedding_configs_updated: bool, + pub(crate) embedding_config_updates: BTreeMap, pub(crate) settings_update_only: bool, /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. 
@@ -1140,7 +1204,7 @@ impl InnerIndexSettingsDiff { old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, primary_key_id: Option, - embedding_configs_updated: bool, + embedding_config_updates: BTreeMap, settings_update_only: bool, ) -> Self { let only_additional_fields = match ( @@ -1177,7 +1241,7 @@ impl InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, only_additional_fields, cache_reindex_searchable_without_user_defined, @@ -1244,7 +1308,7 @@ impl InnerIndexSettingsDiff { } pub fn reindex_vectors(&self) -> bool { - self.embedding_configs_updated + !self.embedding_config_updates.is_empty() } pub fn settings_update_only(&self) -> bool { From d1dd7e5d097dea50d85d49c3a14ffbef62f46bb7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:05:52 +0200 Subject: [PATCH 080/110] In transform for removed embedders, write back their user provided vectors in documents, and clear the writers --- milli/src/update/index_documents/transform.rs | 118 +++++++++++++++++- 1 file changed, 114 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c34b7876a..f58ffebf0 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -27,6 +27,7 @@ use crate::update::del_add::{ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use 
crate::{ is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, }; @@ -808,13 +809,13 @@ impl<'a, 'i> Transform<'a, 'i> { let mut new_inner_settings = old_inner_settings.clone(); new_inner_settings.fields_ids_map = fields_ids_map; - let embedding_configs_updated = false; + let embedding_config_updates = Default::default(); let settings_update_only = false; let settings_diff = InnerIndexSettingsDiff::new( old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -835,10 +836,13 @@ impl<'a, 'i> Transform<'a, 'i> { /// Rebind the field_ids of the provided document to their values /// based on the field_ids_maps difference between the old and the new settings, /// then fill the provided buffers with delta documents using KvWritterDelAdd. + #[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo fn rebind_existing_document( old_obkv: KvReader, settings_diff: &InnerIndexSettingsDiff, modified_faceted_fields: &HashSet, + mut injected_vectors: serde_json::Map, + old_vectors_fid: Option, original_obkv_buffer: Option<&mut Vec>, flattened_obkv_buffer: Option<&mut Vec>, ) -> Result<()> { @@ -863,7 +867,36 @@ impl<'a, 'i> Transform<'a, 'i> { let mut operations = HashMap::new(); let mut obkv_writer = KvWriter::<_, FieldId>::memory(); - for (id, val) in old_obkv.iter() { + 'write_fid: for (id, val) in old_obkv.iter() { + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + if id != vectors_fid { + break 'inject_vectors; + } + + let existing_vectors: std::result::Result< + serde_json::Map, + serde_json::Error, + > = serde_json::from_slice(val); + + let mut existing_vectors = match existing_vectors { + Ok(existing_vectors) => existing_vectors, + Err(error) => { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. 
Treating as an empty map"); + Default::default() + } + }; + + existing_vectors.append(&mut injected_vectors); + + operations.insert(id, DelAddOperation::DeletionAndAddition); + obkv_writer.insert(id, serde_json::to_vec(&existing_vectors).unwrap())?; + continue 'write_fid; + } + } + if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; @@ -937,6 +970,35 @@ impl<'a, 'i> Transform<'a, 'i> { None }; + let readers: Result< + BTreeMap<&str, (Vec>, &RoaringBitmap)>, + > = settings_diff + .embedding_config_updates + .iter() + .filter_map(|(name, action)| { + if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }) = action + { + let readers: Result> = + self.index.arroy_readers(wtxn, *embedder_id).collect(); + match readers { + Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))), + Err(error) => Some(Err(error)), + } + } else { + None + } + }) + .collect(); + let readers = readers?; + + let old_vectors_fid = settings_diff + .old + .fields_ids_map + .id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + // We initialize the sorter with the user indexing settings. 
let mut flattened_sorter = if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { @@ -963,10 +1025,41 @@ impl<'a, 'i> Transform<'a, 'i> { InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, )?; + let injected_vectors: std::result::Result< + serde_json::Map, + arroy::Error, + > = readers + .iter() + .filter_map(|(name, (readers, user_provided))| { + if !user_provided.contains(docid) { + return None; + } + let mut vectors = Vec::new(); + for reader in readers { + let Some(vector) = reader.item_vector(wtxn, docid).transpose() else { + break; + }; + + match vector { + Ok(vector) => vectors.push(vector), + Err(error) => return Some(Err(error)), + } + } + if vectors.is_empty() { + return None; + } + Some(Ok((name.to_string(), serde_json::to_value(vectors).unwrap()))) + }) + .collect(); + + let injected_vectors = injected_vectors?; + Self::rebind_existing_document( old_obkv, &settings_diff, &modified_faceted_fields, + injected_vectors, + old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), )?; @@ -983,6 +1076,23 @@ impl<'a, 'i> Transform<'a, 'i> { } } + let mut writers = Vec::new(); + + // delete all vectors from the embedders that need removal + for (_, (readers, _)) in readers { + for reader in readers { + let dimensions = reader.dimensions(); + let arroy_index = reader.index(); + drop(reader); + let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions); + writers.push(writer); + } + } + + for writer in writers { + writer.clear(wtxn)?; + } + let grenad_params = GrenadParameters { chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_level: self.indexer_settings.chunk_compression_level, From f5cf01e7d1efc5c383837826eabc4d887a957374 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:11:29 +0200 Subject: [PATCH 081/110] Rework extraction to use 
EmbedderAction --- .../extract/extract_vector_points.rs | 461 ++++++++++++------ .../src/update/index_documents/extract/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 4 +- .../src/update/index_documents/typed_chunk.rs | 10 +- 4 files changed, 318 insertions(+), 161 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 48e3e697a..fdf8649f4 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -17,9 +17,10 @@ use crate::index::IndexEmbeddingConfig; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; -use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::settings::{EmbedderAction, ReindexAction}; use crate::vector::Embedder; -use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. 
const TRUNCATE_SIZE: usize = size_of::(); @@ -35,7 +36,7 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, - pub user_provided: RoaringBitmap, + pub add_to_user_provided: RoaringBitmap, pub remove_from_user_provided: RoaringBitmap, } @@ -44,12 +45,7 @@ enum VectorStateDelta { // Remove all vectors, generated or manual, from this document NowRemoved, - // Add the manually specified vectors, passed in the other grenad - // Remove any previously generated vectors - // Note: changing the value of the manually specified vector **should not record** this delta - WasGeneratedNowManual(Vec>), - - ManualDelta(Vec>), + NowManual(Vec>), // Add the vector computed from the specified prompt // Remove any previous vector @@ -62,9 +58,8 @@ impl VectorStateDelta { match self { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), // We always delete the previous vectors - VectorStateDelta::ManualDelta(add) => (true, Default::default(), add), + VectorStateDelta::NowManual(add) => (true, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } @@ -75,19 +70,29 @@ struct EmbedderVectorExtractor { embedder: Arc, prompt: Arc, - // (docid, _index) -> KvWriterDelAdd -> Vector - manual_vectors_writer: Writer>, // (docid) -> (prompt) prompts_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, - + // (docid, _index) -> KvWriterDelAdd -> Vector + manual_vectors_writer: Writer>, // The docids of the documents that contains a user defined embedding - user_provided: RoaringBitmap, + add_to_user_provided: RoaringBitmap, + + action: ExtractionAction, +} + +struct DocumentOperation { // The docids of the documents that contains an auto-generated embedding remove_from_user_provided: RoaringBitmap, } +enum ExtractionAction { + 
SettingsFullReindex, + SettingsRegeneratePrompts { old_prompt: Arc }, + DocumentOperation(DocumentOperation), +} + /// Extracts the embedding vector contained in each document under the `_vectors` field. /// /// Returns the generated grenad reader containing the docid as key associated to the Vec @@ -104,46 +109,109 @@ pub fn extract_vector_points( let new_fields_ids_map = &settings_diff.new.fields_ids_map; // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); - // filter the old vector fid if the settings has been changed forcing reindexing. - let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors); let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let mut extractors = Vec::new(); - for (embedder_name, (embedder, prompt)) in - settings_diff.new.embedding_configs.clone().into_iter() - { - // (docid, _index) -> KvWriterDelAdd -> Vector - let manual_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - // (docid) -> (prompt) - let prompts_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); + let old_configs = &settings_diff.old.embedding_configs; - // (docid) -> () - let remove_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + if reindex_vectors { + for (name, action) in settings_diff.embedding_config_updates.iter() { + match action { + EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted + EmbedderAction::Reindex(action) => { + let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name) + else { + tracing::error!(embedder = name, "Requested embedder config not found"); + continue; + }; - extractors.push(EmbedderVectorExtractor { - 
embedder_name, - embedder, - prompt, - manual_vectors_writer, - prompts_writer, - remove_vectors_writer, - user_provided: RoaringBitmap::new(), - remove_from_user_provided: RoaringBitmap::new(), - }); + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let action = match action { + ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex, + ReindexAction::RegeneratePrompts => { + let Some((_, old_prompt)) = old_configs.get(name) else { + tracing::error!(embedder = name, "Old embedder config not found"); + continue; + }; + + ExtractionAction::SettingsRegeneratePrompts { old_prompt } + } + }; + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action, + }); + } + } + } + } else { + // document operation + + for (embedder_name, (embedder, prompt)) in configs.into_iter() { + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + extractors.push(EmbedderVectorExtractor { + embedder_name, 
+ embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action: ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided: RoaringBitmap::new(), + }), + }); + } } let mut key_buffer = Vec::new(); @@ -177,111 +245,66 @@ pub fn extract_vector_points( embedder_name, embedder: _, prompt, - manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_provided, - remove_from_user_provided, + manual_vectors_writer, + add_to_user_provided, + action, } in extractors.iter_mut() { - let delta = match parsed_vectors.remove(embedder_name) { - (Some(old), Some(new)) => { - match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { - (true, true) | (false, false) => (), - (true, false) => { - remove_from_user_provided.insert(docid); + let (old, new) = parsed_vectors.remove(embedder_name); + let delta = match action { + ExtractionAction::SettingsFullReindex => match old { + // A full reindex can be triggered either by: + // 1. a new embedder + // 2. an existing embedder changed so that it must regenerate all generated embeddings. 
+ // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB + VectorState::Inline(vectors) => { + if vectors.is_user_provided() { + add_to_user_provided.insert(docid); } - (false, true) => { - user_provided.insert(docid); + let add_vectors = vectors.into_array_of_vectors(); + + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); } + + VectorStateDelta::NowManual(add_vectors) } - - // no autogeneration - let add_vectors = new.into_array_of_vectors(); - - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::ManualDelta(add_vectors) - } - (Some(old), None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept && old.is_some() { - remove_from_user_provided.insert(docid); - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render( + // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors + VectorState::InDb => VectorStateDelta::NoChange, + // generated vectors must be regenerated + VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, + }, + // prompt regeneration is only triggered for existing embedders + ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { + if !old.is_user_provided() { + regenerate_if_prompt_changed( obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) - } else if document_is_kept && old.is_none() { + (old_prompt, prompt), + (&old_fields_ids_map, &new_fields_ids_map), + )? 
+ } else { + // we can simply ignore user provided vectors as they are not regenerated and are + // already in the DB since this is an existing embedder VectorStateDelta::NoChange - } else { - remove_from_user_provided.insert(docid); - VectorStateDelta::NowRemoved - } - } - (None, Some(new)) => { - if new.is_user_provided() { - user_provided.insert(docid); - } else { - remove_from_user_provided.insert(docid); - } - // was possibly autogenerated, remove all vectors for that document - let add_vectors = new.into_array_of_vectors(); - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::WasGeneratedNowManual(add_vectors) - } - (None, None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - - if document_is_kept { - // Don't give up if the old prompt was failing - let old_prompt = Some(&prompt) - // TODO: this filter works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. 
- .filter(|_| !settings_diff.reindex_vectors()) - .map(|p| { - p.render(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default() - }); - let new_prompt = - prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange - } - } else { - remove_from_user_provided.remove(docid); - VectorStateDelta::NowRemoved } } + ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) => extract_vector_document_diff( + docid, + obkv, + prompt, + (add_to_user_provided, remove_from_user_provided), + (old, new), + (&old_fields_ids_map, &new_fields_ids_map), + document_id, + )?, }; - // and we finally push the unique vectors into the writer push_vectors_diff( remove_vectors_writer, @@ -289,7 +312,6 @@ pub fn extract_vector_points( manual_vectors_writer, &mut key_buffer, delta, - reindex_vectors, )?; } } @@ -300,20 +322,30 @@ pub fn extract_vector_points( embedder_name, embedder, prompt: _, - manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_provided, - remove_from_user_provided, + action, + manual_vectors_writer, + add_to_user_provided, } in extractors { + let remove_from_user_provided = + if let ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) = action + { + remove_from_user_provided + } else { + Default::default() + }; + results.push(ExtractedVectorPoints { manual_vectors: writer_into_reader(manual_vectors_writer)?, remove_vectors: writer_into_reader(remove_vectors_writer)?, prompts: writer_into_reader(prompts_writer)?, embedder, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, }) } @@ -321,6 +353,136 @@ pub fn 
extract_vector_points( Ok(results) } +fn extract_vector_document_diff( + docid: DocumentId, + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), + (old, new): (VectorState, VectorState), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), + document_id: impl Fn() -> Value, +) -> Result { + match (old.is_user_provided(), new.is_user_provided()) { + (true, true) | (false, false) => {} + (true, false) => { + remove_from_user_provided.insert(docid); + } + (false, true) => { + add_to_user_provided.insert(docid); + } + } + + let delta = match (old, new) { + // regardless of the previous state, if a document now contains inline _vectors, they must + // be extracted manually + (_old, VectorState::Inline(new)) => { + let add_vectors = new.into_array_of_vectors(); + + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } + + VectorStateDelta::NowManual(add_vectors) + } + // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the + // document changed + (VectorState::Generated, VectorState::Generated) => { + // Do we keep this document? 
+ let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + + if document_is_kept { + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt).map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() + }); + let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange + } + } else { + VectorStateDelta::NowRemoved + } + } + // when the vectors are no longer user-provided, + // we generate the prompt unconditionally + (_not_generated, VectorState::Generated) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // becomes autogenerated + VectorStateDelta::NowGenerated(prompt.render( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + (_old, VectorState::InDb) => { + // Do we keep this document? 
+ let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // if the new version of documents has the vectors in the DB, + // then they are user-provided and nothing possibly changed + VectorStateDelta::NoChange + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + }; + + Ok(delta) +} + +fn regenerate_if_prompt_changed( + obkv: obkv::KvReader<'_, FieldId>, + (old_prompt, new_prompt): (&Prompt, &Prompt), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), +) -> Result { + let old_prompt = + old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default()); + let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + if new_prompt == old_prompt { + return Ok(VectorStateDelta::NoChange); + } + Ok(VectorStateDelta::NowGenerated(new_prompt)) +} + +fn regenerate_prompt( + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + new_fields_ids_map: &FieldsIdsMap, +) -> Result { + let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + Ok(VectorStateDelta::NowGenerated(prompt)) +} + /// We cannot compute the diff between both Del and Add vectors. /// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( @@ -329,14 +491,9 @@ fn push_vectors_diff( manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, - reindex_vectors: bool, ) -> Result<()> { let (must_remove, prompt, mut add_vectors) = delta.into_values(); - if must_remove - // TODO: the below condition works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. 
- && !reindex_vectors - { + if must_remove { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 2babe330f..9da3983fc 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -248,7 +248,7 @@ fn send_original_documents_data( prompts, embedder_name, embedder, - user_provided, + add_to_user_provided, remove_from_user_provided, } in extracted_vectors { @@ -274,7 +274,7 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, })); } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a533f1984..3586c9c6d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -503,7 +503,7 @@ where embeddings, manual_vectors, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, } => { dimension.insert(embedder_name.clone(), expected_dimension); @@ -513,7 +513,7 @@ where expected_dimension, manual_vectors, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 0cb5e58af..4737c6b42 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -91,7 +91,7 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, - user_provided: RoaringBitmap, + add_to_user_provided: RoaringBitmap, remove_from_user_provided: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), @@ -625,7 +625,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut 
remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); - let mut user_provided = RoaringBitmap::new(); + let mut add_to_user_provided = RoaringBitmap::new(); let mut remove_from_user_provided = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { @@ -635,7 +635,7 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, - user_provided: ud, + add_to_user_provided: aud, remove_from_user_provided: rud, } = typed_chunk else { @@ -649,7 +649,7 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } - user_provided |= ud; + add_to_user_provided |= aud; remove_from_user_provided |= rud; } @@ -662,7 +662,7 @@ pub(crate) fn write_typed_chunk_into_index( .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name) .unwrap(); index_embedder_config.user_provided -= remove_from_user_provided; - index_embedder_config.user_provided |= user_provided; + index_embedder_config.user_provided |= add_to_user_provided; index.put_embedding_configs(wtxn, embedding_configs)?; From fca9fe39b35ab513ec3d505a58c34d476ee443e4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:49:38 +0200 Subject: [PATCH 082/110] Update test snapshots --- index-scheduler/src/lib.rs | 6 ++---- meilisearch/tests/vector/settings.rs | 9 ++------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index e2a6f03a0..fd7f29f6c 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5579,7 +5579,6 @@ mod tests { .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); let conf = index.embedding_configs(&rtxn).unwrap(); - // TODO: Here the user provided 
vectors should NOT contains 1 snapshot!(format!("{conf:#?}"), @r###" [ IndexEmbeddingConfig { @@ -5595,7 +5594,7 @@ mod tests { template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", }, }, - user_provided: RoaringBitmap<[0, 1]>, + user_provided: RoaringBitmap<[0]>, }, ] "###); @@ -5620,7 +5619,6 @@ mod tests { .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); let conf = index.embedding_configs(&rtxn).unwrap(); - // TODO: Here the user provided vectors should contains nothing snapshot!(format!("{conf:#?}"), @r###" [ IndexEmbeddingConfig { @@ -5636,7 +5634,7 @@ mod tests { template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", }, }, - user_provided: RoaringBitmap<[0, 1]>, + user_provided: RoaringBitmap<[]>, }, ] "###); diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs index 6b93f001e..e11f4368f 100644 --- a/meilisearch/tests/vector/settings.rs +++ b/meilisearch/tests/vector/settings.rs @@ -43,7 +43,7 @@ async fn update_embedder() { { "uid": 1, "indexUid": "doggo", - "status": "failed", + "status": "succeeded", "type": "settingsUpdate", "canceledBy": null, "details": { @@ -54,12 +54,7 @@ async fn update_embedder() { } } }, - "error": { - "message": "`.embedders.manual`: Field `model` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`). 
Available fields: `source`, `dimensions`, `distribution`", - "code": "invalid_settings_embedders", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" - }, + "error": null, "duration": "[duration]", "enqueuedAt": "[date]", "startedAt": "[date]", From 34fabed214d92e607ed862e0b51a5fe8c3e93199 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 17:09:34 +0200 Subject: [PATCH 083/110] Add test for vector writeback --- index-scheduler/src/lib.rs | 167 +++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index fd7f29f6c..4278d15b3 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5639,4 +5639,171 @@ mod tests { ] "###); } + + #[test] + fn delete_embedder_with_user_provided_vectors() { + // 1. Add two embedders + // 2. Push two documents containing a simple vector + // 3. The documents must not contain the vectors after the update as they are in the vectors db + // 3. Delete the embedders + // 4. The documents contain the vectors again + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }), + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }), + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + "my_doggo_embedder": vec![1; 384], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| 
obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Reset, + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"userProvided":true}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"userProvided":true}}}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Reset, + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, 
&field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + + /// FIXME: redaction + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"userProvided\":true},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"userProvided\":true}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddin
gs\":[[1.0,1.0,1.0]],\"userProvided\":true}}}]""###); + } + } } From a89eea233bbd8d5bda76e6f1a195485639a31bc4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 17:10:19 +0200 Subject: [PATCH 084/110] Fix vectors injection --- milli/src/update/index_documents/transform.rs | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f58ffebf0..b2fe04a4c 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -27,6 +27,7 @@ use crate::update::del_add::{ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use crate::{ is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, @@ -872,28 +873,35 @@ impl<'a, 'i> Transform<'a, 'i> { 'inject_vectors: { let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; - if id != vectors_fid { + if id < vectors_fid { break 'inject_vectors; } - let existing_vectors: std::result::Result< - serde_json::Map, - serde_json::Error, - > = serde_json::from_slice(val); + let mut existing_vectors = if id == vectors_fid { + let existing_vectors: std::result::Result< + serde_json::Map, + serde_json::Error, + > = serde_json::from_slice(val); - let mut existing_vectors = match existing_vectors { - Ok(existing_vectors) => existing_vectors, - Err(error) => { - tracing::error!(%error, "Unexpected `_vectors` field that is not a map. 
Treating as an empty map"); - Default::default() + match existing_vectors { + Ok(existing_vectors) => existing_vectors, + Err(error) => { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map"); + Default::default() + } } + } else { + Default::default() }; existing_vectors.append(&mut injected_vectors); - operations.insert(id, DelAddOperation::DeletionAndAddition); - obkv_writer.insert(id, serde_json::to_vec(&existing_vectors).unwrap())?; - continue 'write_fid; + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer + .insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?; + if id == vectors_fid { + continue 'write_fid; + } } } @@ -905,6 +913,15 @@ impl<'a, 'i> Transform<'a, 'i> { obkv_writer.insert(id, val)?; } } + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?; + } + } + let data = obkv_writer.into_inner()?; let obkv = KvReader::::new(&data); @@ -1048,7 +1065,14 @@ impl<'a, 'i> Transform<'a, 'i> { if vectors.is_empty() { return None; } - Some(Ok((name.to_string(), serde_json::to_value(vectors).unwrap()))) + Some(Ok(( + name.to_string(), + serde_json::to_value(ExplicitVectors { + embeddings: VectorOrArrayOfVectors::from_array_of_vectors(vectors), + user_provided: true, + }) + .unwrap(), + ))) }) .collect(); From 3bc8f81abc3f8d57060c1571d6801e50f43ce33f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 18:11:11 +0200 Subject: [PATCH 085/110] user_provided => regenerate --- index-scheduler/src/batch.rs | 6 +- meilisearch/src/routes/indexes/documents.rs | 5 +- meilisearch/src/search.rs | 3 +- .../extract/extract_vector_points.rs | 56 ++++++++++--------- milli/src/update/index_documents/transform.rs | 6 +- 
milli/src/vector/parsed_vectors.rs | 34 +++++------ 6 files changed, 62 insertions(+), 48 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 30ff54a62..cd5525eea 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -958,10 +958,10 @@ impl IndexScheduler { .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - user_provided, + regenerate: !user_provided, }; vectors.insert( embedder_name, diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 70623bb35..bfbe20207 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -625,7 +625,10 @@ fn some_documents<'a, 't: 'a>( .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + let embeddings = ExplicitVectors { + embeddings: Some(vector.into()), + regenerate: !user_provided, + }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index ce712f17f..60f684ede 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1072,7 +1072,8 @@ fn make_hits( .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(id)); - let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; vectors.insert(name, serde_json::to_value(embeddings)?); } document.insert("_vectors".into(), vectors.into()); diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs 
b/milli/src/update/index_documents/extract/extract_vector_points.rs index fdf8649f4..0a27a28bd 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -260,28 +260,33 @@ pub fn extract_vector_points( // 2. an existing embedder changed so that it must regenerate all generated embeddings. // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB VectorState::Inline(vectors) => { - if vectors.is_user_provided() { + if !vectors.must_regenerate() { add_to_user_provided.insert(docid); } - let add_vectors = vectors.into_array_of_vectors(); - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); + match vectors.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError( + crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ), + )); + } + VectorStateDelta::NowManual(add_vectors) + } + None => VectorStateDelta::NoChange, } - - VectorStateDelta::NowManual(add_vectors) } // this happens only when an existing embedder changed. 
We cannot regenerate userProvided vectors - VectorState::InDb => VectorStateDelta::NoChange, + VectorState::Manual => VectorStateDelta::NoChange, // generated vectors must be regenerated VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, }, // prompt regeneration is only triggered for existing embedders ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { - if !old.is_user_provided() { + if old.must_regenerate() { regenerate_if_prompt_changed( obkv, (old_prompt, prompt), @@ -362,31 +367,32 @@ fn extract_vector_document_diff( (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), document_id: impl Fn() -> Value, ) -> Result { - match (old.is_user_provided(), new.is_user_provided()) { + match (old.must_regenerate(), new.must_regenerate()) { (true, true) | (false, false) => {} (true, false) => { - remove_from_user_provided.insert(docid); + add_to_user_provided.insert(docid); } (false, true) => { - add_to_user_provided.insert(docid); + remove_from_user_provided.insert(docid); } } let delta = match (old, new) { // regardless of the previous state, if a document now contains inline _vectors, they must // be extracted manually - (_old, VectorState::Inline(new)) => { - let add_vectors = new.into_array_of_vectors(); + (_old, VectorState::Inline(new)) => match new.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); + VectorStateDelta::NowManual(add_vectors) } - - VectorStateDelta::NowManual(add_vectors) - } + None => VectorStateDelta::NoChange, + }, // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the // document 
changed (VectorState::Generated, VectorState::Generated) => { @@ -437,7 +443,7 @@ fn extract_vector_document_diff( VectorStateDelta::NowRemoved } } - (_old, VectorState::InDb) => { + (_old, VectorState::Manual) => { // Do we keep this document? let document_is_kept = obkv .iter() diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b2fe04a4c..467a2810a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1068,8 +1068,10 @@ impl<'a, 'i> Transform<'a, 'i> { Some(Ok(( name.to_string(), serde_json::to_value(ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors(vectors), - user_provided: true, + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + vectors, + )), + regenerate: false, }) .unwrap(), ))) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 9007e03e4..92d6cb382 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -18,18 +18,20 @@ pub enum Vectors { } impl Vectors { - pub fn is_user_provided(&self) -> bool { + pub fn must_regenerate(&self) -> bool { match self { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided, + Vectors::ImplicitlyUserProvided(_) => false, + Vectors::Explicit(ExplicitVectors { regenerate, .. 
}) => *regenerate, } } - pub fn into_array_of_vectors(self) -> Vec { + pub fn into_array_of_vectors(self) -> Option> { match self { - Vectors::ImplicitlyUserProvided(embeddings) - | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { - embeddings.into_array_of_vectors().unwrap_or_default() + Vectors::ImplicitlyUserProvided(embeddings) => { + Some(embeddings.into_array_of_vectors().unwrap_or_default()) + } + Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => { + embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default()) } } } @@ -38,22 +40,22 @@ impl Vectors { #[derive(serde::Serialize, serde::Deserialize, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { - pub embeddings: VectorOrArrayOfVectors, - pub user_provided: bool, + pub embeddings: Option, + pub regenerate: bool, } pub enum VectorState { Inline(Vectors), - InDb, + Manual, Generated, } impl VectorState { - pub fn is_user_provided(&self) -> bool { + pub fn must_regenerate(&self) -> bool { match self { - VectorState::Inline(vectors) => vectors.is_user_provided(), - VectorState::InDb => true, - VectorState::Generated => false, + VectorState::Inline(vectors) => vectors.must_regenerate(), + VectorState::Manual => false, + VectorState::Generated => true, } } } @@ -96,7 +98,7 @@ impl ParsedVectorsDiff { .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); for embedding_config in embedders_configs { if embedding_config.user_provided.contains(docid) { - old.entry(embedding_config.name.to_string()).or_insert(VectorState::InDb); + old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); } } @@ -121,7 +123,7 @@ impl ParsedVectorsDiff { let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); let state_from_old = match old { // assume a userProvided is still userProvided - VectorState::InDb => VectorState::InDb, + 
VectorState::Manual => VectorState::Manual, // generated is still generated VectorState::Generated => VectorState::Generated, // weird case that shouldn't happen were the previous docs version is inline, From bc547dad6fb5ecf9cc9f7ccb896b817f1b9eadde Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 18:12:56 +0200 Subject: [PATCH 086/110] Update dump file --- dump/tests/assets/v6-with-vectors.dump | Bin 17539 -> 19136 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dump/tests/assets/v6-with-vectors.dump b/dump/tests/assets/v6-with-vectors.dump index 9f8ed2ba11d233ed5a1fadd25a18e3d25c7e04a1..8c0505772efde9409111f4aaba1a74714ee89f9a 100644 GIT binary patch literal 19136 zcmV)4K+3-#iwFQs#%X2%1MQvPk{rjC-f24=4$Bw5vaiSE*c%xbW@cqpRv}_9Agwnf zd$poPu57Qo!EMYm$gu`9(EO1IGx#39b@;9P7JdVNzwcxZKoG=`S^%At+ zndi@U&iT%%qffVAJl$TNKG~jMZ_e&N-fpgMF1J@l|NZLX{DU`iq?G1b5BP7NoBx|i z{5!?((?PCt$+fgO%?By>+Uo}g=}le0>yDeN>&>MuDl(^>E*?5x96Ml<1KIaC;#DJ*8lUr{_nv*{2uqm zw;msDzWB*@bF#hs@a@@wyYxS|@^$?$xs?9F!544h2JbrFbNz4q;IDrAqldrv@lSsG z%YT3L#pe3@^5|6x`R5P+)5C`k)5S-B{nx+weEIm{59)gGu|D)y|MhhrdO?eS@Q1$B z{qd^f?R)?4p#P=3?Eg!f_Wu7}9A6&(c6)q%arv=+wucX{FK@Q@4xenVFHeuJ4j(+; zoL%YPXBVGc{o>;0dVBkd#OC7i`lFl2k5A8kvE}#OyTAHF-_ic~KF86gr{^czFSb{2 z@jKA_*1iw^pXS!~`u|RjJ^p{MkLURR?b(65^#2?9Kew9G9{<0KV~_ve?_+5Fhi~&c z?$Uqsn`4jv-s@wh|IcyY zo3{hsAOH1I_xayklK#&<)jj@u7sr$io?M?k+5UO%c`2!{wI9`G zt@E-v05@{vUmMwY|PRJ^$=YazN7e=YRHj zpZ|L|#~%N^*T+lx{~QLsK|64l{=b6%+44UB_fC#I!h64um-PQFc>g=~KUWFiOa8yL zzW4v{;`s9LdXueO`Z+R(?{>aBJF0Wtp`sL>Q*OIPZ+?>T}es^(sa&>s`@cP-)3;joioFW-}3w6gQipb?qa;Y#&vLlpP#>Q zo42oheP#9T$D89Vn#(z#Y`*yA=ImyB_0jh7FM0j&K~5>w(f^LW{^%KR{-JJSbN2M} z&EZ!%&C|_ir{|k%-?fztdv%Vaw``w0+MWm@`0`J-k2g1G*Zg{QadXL? 
zAAWxGnNIQ<-=uYUa&fXf)2lyv`0%G6KKg8X|Iy}Zdw+X=|F`+5>F<}@-|p6O&i!~( zm-S@bF7u2VZ^}|2R6|JlP(6_0@ytul$yk&`baATp!<_pKyjh`^rlZvj5|o z>x;*ae|8l@@TW&^{zm-wWP80i*<5e_!0}(&$A9nU*vEhG_3^Fo-y5_8zeoI6r+xhQ zPL6&2_kJHQ>Hk~s{@+3W=lPrc|NH*GcXE7rcyp>4Wpn{L2s?eE$mlYX=FPrl`yH9ctUsFx`MGZ=lG8J|9Sndq$#Dn{=buBkN@B6<6H6n8?*y=>i^69pCacm@A3b; zI9|>FN$Zr$I`8>EfAYsm`u`TZ{~h|Dvwm+c`~NiW>p$=0`0~*3f160EZ?BH^yY%z& z^7LeTesCKU408C_Q~tk7NBQ3VXaBSR+5hZ+_CNcd{m=eq|Fi#jFMj^+tFL~r3m@|H z@Zp>plAE(Tf`=~$j|j4%LWy79Pe*0#Yi&#EiU)fx(>=Y`%CgKgEp6_#EN$hrl-IR& z{Ugn5PRly`oq4HioAOfDNuRF0=o0mrb?sAKCeASNQY(|**h}kWD)T(+rK-Slp36ej zX~~OTnM$uU>$O%_-aF|vg{+ERb&WdZI?Xjt3tzjo)R(EQIZt)2)68pAORM78o^&a- zDwgHrOP$xwWRqoT(3|hw&qp=)dCpT>OD&}?jfIf~_)?a8 z`6x}j=)Ut>wO%Rtl6%$?W&KO{T1w|!X--qtpA~)gIxj6wmK&v+!rJNDs&B2YiofT^ z+?A|*=YIRDn=iGeqS#$a$Y-?zebP5*{qzZ^ebB}e-OJ18+P}kR3S$|mfuB(`rGO=}BwzoWIZN;oh z*Pf*27i(P-Wu~nEuZ`!FlNO|D+w|_WYAq37{bR{mhB`O4)GMzWmsIqYa<0`2GfTKu z?VP^2X@80@MQ6~;ui7z5nXcO_C;`=8>jc`9Wm&VAx=W%~y;1vF+hpPGNt>#TT_oe$ z9iMSpwMJUSRJ2uT&0YrWP18y*2u#-v+=~@l$jfQ(brH?yTOf{^N_9riwwsLK2?7J3vPQ6OV^(Ry}E>T;72unM)I-T{`iKIalbzf@i z<=Q16)0(6xDK5Gc-Q1Ll_OeJy;wyC8InOJa*hn2&g{5v3FIpE;Jz^c z*59tHSXh&!`{{f%Ybmw0x{9^UND$v3mDB$wiG!}z=1=#ov*;!5PuD4|g!9rT35JZ0 zmeeoF#*}HQ1)Y_s^HM`>B>>tUsU|x+YoC{_Ta?)9#w?Dx7k!fitMt^gTGllQnXH_? 
zwURgE3ucMnB=zX)gf`9WmkpJczfTgLHH0F~1sT&y$totPtEES;=xQWQUHFwh5hc#Rg8Y z1ClW>Ws|ee70G94(QWfH23yh~UC`HAWV9%4DiUC6f005>v26N%k%&%(1!;a!i#8`o zoFqn;C5H$~owbZw6J2r*DVG(LF3!431l>}jjn-bu0$@XQ(_Y?L3L?v1WvKN2?C)Sr z3#KC)_I!9!?MV_4XKSbZNl#9t68p!osmo|x|u1IQ`e&9(E2sJ#`t3G86VOT%H7oX z-Z`;;(z=FxcXmVqwMxn7sTHop&N?kx{AH5TY2&nvt39N?SSr?7FqwcB=WD6hR^!|P zCRl^z#~X&P*I|q#*|ae)UWnKcoe;w&&C_YaOiPc7-?@@WLOof*vF2T8lq_WFtyDypBJ88f!g#iwP3m}c+1OcXCnXLIku?(bSoBHW zA3`aU$6KpZxJZ(XY`9syrYn*YnY~WB1W8)M$)V(ESC$o_5FqPNdGBJ+r*-H1Cf-_Q z&{#WtvHY`iTd!pMIGaVSmAX0WnQ$E zJ>){52{$F#Er2}x+@?743#+)mJ!$$0OsB%&Xs5Ih#c`8Hca?i_tgF2+IJSc zRKamUNv1a%n-(9I;goC$dCK;BYk_G3o#1gf z^x?g9GxE0>C3!JjAs0?S zk|t@awp+Gn_O}YN=#mJE8aTt4SYn_Rn{dNfHUw*-59#t`D>^W4@y!Y{N@QxmgXc9O z1Ax9fnlS;PM6v}TSdboeUK?moU%Kd8GB{4|F2bOMF)w_JtSTa=t;ratOig+K7w|tJ?sodMPI%;`0|%XQnbJl z6d7N@pApug|H>#fmMau=kvo!IAu#0{*KqcNKeGA~tb#+0yEX7lK%kka=A2FfSFj-x zQVW!UuDrK&f(6AEvUs}U6}ITJ+_5KH!C zN$q_c6cK1OVwKs^G1N;o!jE8Z;HEf5mZGghCo(*-C{iRMzCy%fwX#K)h~WC9YeX3l zi^%x@~6dp#*%zFyo1cq1q706M@LZ)q=a4lXFT}ahU>ZTh;JSUoHn?7I*P zeo25tP|YASJKVuZ$^}<@<%Ot6L5>gH%g0RY(r`3FHj zEwC(e4alG+>X^=gKIX^;kbsapuyfuZ-3?9)Z$zXKKD2_}R)i&y3_+6{A&#uHf+51J zFcpHpgbJtyaVjc+gT;Y&b}sy?EQWmivN|5iyEtqjGy_S3%4n7yve36{ zeP!)s9M+Kw(c;P`YTp{`IUs}XLY`TkWksQi;ekMq_7K=5L0bcY$%$dl!8uM}=C3D(CEp=ML zlnE_Ee`rLCl3cq6G9$Pp{gL#PZ$l_ca;Q)R!;CosGHGAPV9kgXQP6#(13@F10pR42 z6>X~wC5}M;LpSHEDPRma^3J6;Lr&gdCsG(-Lq; zxJD$I$Z2&HC=XJ4AR+l^on0%4nHo3SnQep&;ni9M_kbg9uEGhKe_xDID1dOuj<$y| z3;qhD$Otb4$>WT8xETaba%^4=5Cl~hoF0g%M5sNHpb{Lcq~B!w9nTTC2^&Zj6c7x1 z&I-$RBMjs$hi3p_jb$dwge{O+)i05&lC6-e zPvlb-c?a%Rd?z>umM~x$Ac+-f#EoRj{80mfj;GcKy>`U4lG|)kn0kOD5&>Em!HYgV z*;qOm)?(D20mKx?Y=1RuUU>M4 z6hKBFZFbfG!jwWe8LbeXNJ7X0>JAbDQ+wosv|Exn;&Pd*W~}cr`couJ^*Cy>) zh1@l1#f0bM!;YjC#mcDY`rF8eB%*iS$rPWB9F#tXg%=8wO^qB`lP{MR6|L#eDQf_c zCUzr?Qbw{X=xioGSmvEg*R>!nXj^R~Qo{S8Y1+BT#!yxUr=h(n0BGNEVk?{_oXAv9 zb~Il2iKtK?(_6wX~%I*~uT0mSxY`@PY)AVxJ@Ttc#HU=Mn`@BK}6i zq^~l!grwoGfaSyw3am(eL`p*jX42L>9U}x$LQ7?`GT{koJ`YEXN9@>JDk*JbIc9P~ 
zILS=PKD-HOt3)VFYQt_ff!QISGAZBd}?WaI2alTH%| zP>V?FCt+mU$YK&zN+n4#MfjlqPFSCAMD5TkT&^TiXhQ23+AZrd$x#Qsj7*3nLZ6kG zE_L`NNhayULWtL*jG}ud&;YeaO2d9oq^S5^y$WWwUaf2kWg^n_c_8sZi~{BtmPZTWX3yW#CiAB|vAApSsW3MMR63(qB^oQBro0VWKF- zjykC9x)G^oiZE})yy&fNT84yT6ra^b%P4_RoH?rDl?u6ReT7uWg(N_k>^sSLDPV+h z0QA{KG`**Vvyp-e7-l%)vIu?~r3t)NB3NoxpF+-&=rnVJdH4qBsM$K^m$c(uh?5sy_@e{p`oHC}nPvj4YBJ&7Isp3lRlFJo!n2bUWrh*=I8-@h>lw6byMDyDtnccPH zKu6u|O4Q*ne+7+WBFKV7_EeuEeAAb&XtlRfZaGuR%l<9p#JUeKb^&u#;#9r`b?9a*iwtiNi>h z6DP{_5Lr_;nTK2~xK%-fN_A$q#~DvT5VuA}i(+*q(h`_1b9_|sKV|_9B4-y#@Jvv! zwE2aYtWnM)nd{Pe*h`X}IzOmiQQ=4@SHK~*w{t_oDJ7!j!h)n}REbCmyQqjw3Q2*c zU~ei!0Y;c^1d=l z3Jr!?)v77pp{y=VElU_j#g#(l=s>|+8+Yq+sos(k0<>FuI%0{xY}u)=K>LfOoeYL< zInJkWkI0pj&agFv??nIP)MmpWSO=^vg=4)IS4Yhkph?v=vJK6V5^2Ac{mmoJ5>{|z zQ+Cmu;Ed23y9j_OYm^TfNqBi?5K^EGtq1bdXcK7jGSV#*2uW{~AC)<0IS~$|Sb8J< z2rHuvOO7_zj7fF|ipU#$L(O6ouaPgxjg;zYcy#h?4I?WcsR$%O7HYw|5v?xS?IG;i zet;i_r9}Zr>IYFFlT)pfTn17`Lga2h1Ty@Q5hTx03AJU`T~f$Iytvzn(aa-QkRe~4 z9HJ0EMb4TCZUJME9S^u0m`}!+*283j3uyPV7Udge^Q0=N?az1cIkQT5b8&`?! 
zKw=&1I7fV|?Psy{HCBlMFmHuxSBIoI1v zz^GdQ{uSnt?#mckOJ61E&^5E2#mh{152qlJYSTTBq~)|2X$Mf4f>9y!rNtK%1#nZXzcLT>?}onM?ydQtLzqTp@Mqsm-XM-BmxDYYuI@LX-Pl^PYdH&aJ6`H`_=T6Io?*tEHtv76H75b zrKD7Gf&w*mKNdjXRxhj*S^eY%%`S%Ojwp~Sg0p-FMiw|H135c_kjPa)y+F5gy$>)| zNSDpHe!-X!W}Qo`0_lKSel~&0A+P?e7`H?kN;g2F2_!p_k_{7C$c|9MoI+4Q+}xaHCi--nN7o}*zRjo*5#Uw7!?_^#OYd$JZBL5HMC97m5 z(o%r;@rbfxvG&>dD6r8C{mxZUEve*haW$%&PQ!SP1tuC^4RCnR$vx`a_$$duTu1{* z>Khb0u&NG8{zTX~LLbdjk$9b8H*)pt_{h7GiYA1Z$$A)xcX%DL1ZG>6kf%is zl_IuQ62loN1{x^YCCQ##NtD0wqg1{K)~1MfCTicVSyHcxY`_ftPgbXqG_Gk7+OlCX zY?O7BdI?FJiAQB}cJ-bMGu1>UhpZ!en&2kM+f%VwsW~dNBzrGJfSDXcJ*@~~OEL2- zlhpCep`vIhnat$ebENCPFsBG)NjQEX?K^L3*6w28Oo=He(p5`cH;1GhAkZgGS69 z;!fpO$cH+tEO%N4&Xk#!wnWfy$v{K%8e}}`Q4`)5mUxaRfPfT8qPU}zp5Qtv?oPIWT;f_wl;TOL%kxrkA4#G{ zf{S|7M0#LeT{tGuBN^09X{{C~4kdlg^VE3&ST+W}q!dz~Hb>Tonjtiq1W_bhawS#m zJO#xDiK@QAkz}TdDN=8+Nri zft$<(q#y$wC9Z_GnSwV{>k72dAZ)IrNDI|CgK3hQl|-)LnpdtQN+1h_ZHf>BoXbLL zQ8XOMZ|euQMgYSz{+++2ih#gY9CA!dOF;04v!H5ewu z_9-Hs^tSuui;u9G5HLnWRXy5Ni6Ri`M{k;sPXe7M=$0vvfw0Ho!ljU?i9py zZ!5V5CrfExpm3C3u0~*by{*$g0>Z_^1UG&k86sATh|1`6hRqoy8eN~{kqD~WdUJK= z2DpnqHa!3ml!ZoIE&-f_lq82E+_Yc@MtX^hNDy_vBxPg`%uejE>7(`~H79%H!fl5- z}q44BbT2ToJb2@-t>DV3%kg(>(z zj=wej2np~|-Ox3e(?5KOn=TT(@i};2Quag@NgG6ag$PIkt}VxDan+m~12qoB zVdUb{CXhTxobcvkOm*$pW`BjgRel-t#x-SU7$^EZ*iM6GjbJTbp0dC9~sD@Mj? 
z`V>`nylTynH7QOxQU`+XH3eTy>uWxt5G2(TmmDGONE$PSo;-{<$r%YDD)?nlzN=`} zOoR*dy+$w_HB6Zd%HQa-qhoT+%PkS@USzk~APPKd(8dc7S)hz*5@`=ur-;Z{!I&nt@aRx+$slb%_)`RC?G;R3>0&gwZ5}fMEu4m8f;% zwzb*Pjf}X4DF8KA12owCk+z#0PQVp~xE54*ctsL!%A(6I(I`(P4FraD*anfMw0N(C zc@#mr1q%i!MO-pgYw+`mSq$295ShGw5MmNIL=l2xqX>)u?(^VF=b*5!RC#4L01@V) z;CW{gK_K0sA>PYm60lpbQiNIBZ(;6EI6|Mn;L-q_E!`L$i|z{k5h$YST6?r3pn@}$ ziB4M6q&OOnI`5SAR^V#5#@P_$8QNCRW@apOy^8g|;X=^2`aiL7QM=*}ldXZm(C z?#chjQ50woE7)~3M(NIhO1e3Pvb>l>|7h5u5YYTtTqVw*-ic&Ha%&xAacO3l2^dF2 zN(RB?7rk7f*aaC`IzUc-3eYRiM{x2|pW(p!fc1o1UJA`A(2sKC;1XbB(Q3qVs zNk|pspor3YSwhO_i6P7dTLq}Erk?#29}2JLZ2#L&Raz+AXA}TglOstlKdHH0|lV0 z5b%w}G#R;UYWk>Dk_9AlvA8ALB13v-0cFdtMigjU?5wO4MxhO)e`Z)}5Q+lALZKsx z1}V8ju2z?^K(v!hpf(g{x6ye?(!4P2H!>a+L9oxzm)nSlfcW$t2r0{5g|-G*pdJBv zbkqqdP-Q0exh=xc+_;c_lmjhsABYMzpCCjQE);}7&?E>#=IBfU7j!o}n?*6OeT!BS zDr(H$$Jm`y}bu-)g~DooVY7KdLW9+3mBRIqUALRV1>5`d)(3=Jd49Q!L{$xRay z%WRGkqmdYSh#6I|5p=#NyE7W_8sSo+Ll9!6{r@gIEhQqAIQ1xLaTnP+a+kJ>*$Bm? ztPYMSBfUfi;Ed=}tD4Q#%Fs-qhES)Uf$h$*O6VtUSl6^hvKLE7vvzf=V32_=kWgRf ztcPC|=B-G$A?Fa~23d!IWrlPkZvc%n_PE15lt}{e)sR&wILL!~31P`#;4YAr2|3*~ zgaDf;!XkpF(&)yPW2N;mN1Hk?6%!oj+6_jk8PXJecwnU+Z>TTQ3#20B9mPI~4ls}~ zL&$JAVb?6+uTTISeaK|yq)<~{qgR~YQ3qHGgx7Xl)Og{>PdD=?qP2#pN}JG52Wja?{Qc75U`l`}y=9z7x8+ZoEj`kX(IX(a;8n5-0t zhFy`>LgZm?3{I^lL;^x-Jy-|07uYwLXGbaDL~!KZwbcf>?nutf1#n_rU8ALd-d^^D zEVC@2edgj`#oN~Sd*)!n()2A-`F1vzeo{!6ee`6KIwgAnZCN{KS{JyKiY5*`JHv#7 zXRL$UgJ+vBxCj;disbYX^=iFmVGcr}aeNu=mUORGn&<=^gqs2h0O+ysB-=@h4YhZ{ zk&XgVP|*fV>WCKE)oT^~uP~%aGXc4bPGhIOX*+MH}~>dB|A_B$}3hR zSYcHDbvH20+|3u84uB%T0)MPUu90pH%x!}E+5?7R<5+VuTY}t5GJ*09+=UOqc&Zd+2p6fuXCxf@ZdMcZ|m1`9AlK7#)3sA0r>I01THCubS#-KkLs=t0{h zCTS|hm0gLy!Z4zGzs<0y{E!mt;^wz}K>j2%apRGzKmx%V>G& z5R)go#TtcD=7uy3eTS!8289xW5EB{9fZ+k_J7xwXQu5{N&Oq1&5j4?FPeRImfy9Yt zk@!OKY1RW}rIeulEF z4zO1|88uY6N46UYij@g}P>7@1D-(S|KwuC^V9rD@u3lf<3hk^hSv{C>Sz&$2HK zzo9JXUz|pORNBCL@CKrJl)piGLBvOpXzuCPCwq_=$rCOzLp2-Od&Z8CN(UTG)@=Md zy-g7IZ|MvROF-M8?8;!?F#rcbq866+MW+^MT!Zi4O)m}SJd@C9o=gBA-#EdH51PN& 
z&blMx(|duHZbS|)(RMfDP02jxu%Z(M92ixd`2tHGlNlgCxxhes!xF$fxiLCtL)|)< zsgrLC6TajChg35O+;tTVjj;y?H#r!N2}M8l4Fej;S&5WQDEwF~Eu( zC~zr5^G!ps7|TGb1a*0=l|6fRWuJ*yV6`#HWTn%ONa z4it!+{Em8+XuBhFpt9OrC@*6k0y#ITNi$;#2Ca~Ef=4Eh(~gEKrc5!zB|)^_wMQT( zJ!-g1^utp*>IU4!UX13Ac%(N)TZIo?o zGDs}3YfIXV7KBCML>$EL5uX-#c#E@m^z+83I9zFNG|U!;OhnEiJ7YmEB5Dcti7wwR zzPxB6hIUC8kz?vntTvzC0T&Qs^kXtCi3wBe#vsyqADT4*Nw7aCCdi_E)PjNr2S9>~ z?#L&Skn~tEE|3KrfDJ@IhDJs`J1)KNDpnX47`h`nzJPM%DHkS?)H(!G%LvtZK{`f1 z2jk#odO8Z`zK=0K?!1BXMuB7SoswsC(QFFwG^Q}F&e;;MleW0$Mg{C6rGL}v`_Kd? z8YU{b^y+(=(LZ`4XfWW^q0+Lyt}d@I8zTh=LMD2K2107zR|q)>pY4$|gMqiu3$l*M zCDRmy-I>G{y~R}SrZFwEBL*alVI{wbHtw&!l8O@cB zMj&LoPdW)-kAQ-LY8gd6pg8wLLuf=#kejt7o=#N8fRzPqCqn|5Jme_V7DoS{6xKY< z0`+CaD|iHk?t26dPAd@e!I>J~l49YscqA8{tp+PJxDX>12GYb}q{(xzRw{%*jC;6Uuj4nXdUhFi4S#e9BtRa=>3GIY9J8@9w$v#>CwdvV zlxSclzdX?5N)--tOu(|lg}|L5@|M=sQ3k5Ga5^>6qcIhS5VCm20F}Klw@hn9KC)1Z zscBd1r15xi(S0JPanEB@7D%MKdB;{mcd0HEE9t}=^9j0nithLxmXRSJB=^C_yPkit zVzY-zhjk*gwK&bYu#KY;g1$6!uaZlHgB;}!F!Gwa9*PQ=T?T+c;TqGQvx8F#*-T7= z4ExVV-$nM6>5E7N1Js5Q$wjyFJ+;miH@o7sl zCr#0B;B>*~?15aF9bO|%1r4WpE_I3wapzl!Ly^l}OlMLiB?VZ&gGM(4o|!d40A;(Z z<((jdO*C-sMvOD=lS<1(K6H$2KrCq+H~S#~*y1T7)NMU`Jwgc3J-k{npOz={zZT93 zjXJI^xv2$DGSPw%ku>F3GFeo=r$HmrJ7^~tx?)`H$j@s7TlZum;_3kDtQ=Vk#ym$g zf@UEZn+o9HWj#s6l8LI@j6Un6X`tE1O?xc_g{lt8emBF7v22tUItw=Wx@_P`%T_zf95r_) z%9BAPc<_w8SQG(JSvRJr!&m6L6Y32_baZ(}A}mv&;zfq?=4s^bJ!{uv>$Sn*@jcrM ze2SwRRbAU}cbQNeDq&eXbdniRG0VqSP2z}IDrB<5qA}D7#1AlA2Sa9NVh~`Q2*zMhQ#8UoF|UsLGAT!!otV~5s@mYs6CIqe#tPET-#x3Q?s{=Z>e4Rd zi4D;kPW-x<4nMWgQ%1!O3z^dxEO5{D!MWP;Q&cH|@(K-G7#omUco!%Vm9FXWD)keawjt&g~Ap2eY$w;cuGYf8=J#{Q{-QZLv$FC#XZ*W49&t$mZ z2w719?1n!DvULRB7(QSU7{J6Y2RF=0asHo{=V>IP8CJ~jr;7cW$K3J)8I=Z|Y1e*H z=|JaVV6;%EVHt=X{7`^Q$6~ZvXZY}pj>GLvwj3)RC=0e^ih@7qWB981Y&7RCK_{n2 z3%|247XbqF_-2p;o!(LUvY&|=Bx~nrh>;nHLM-(B7hn^Gb#&i z&KXliGL3ogqnMf#88K(m;5C-p3JiqNi6D~rsE@wW?pYF;JK9VfhGrNs07gpac`ztl z$EWzNtE|y4%qSNONH&|FdUvpF8mR)F2*AX%k+p+=P|YkQ`U?ViOXj3WDrEZXRTP*_ 
zbi})jPsU+nEr~NmXP`BQZm>1D2GLU*$)t%%_mDT8YqUXf3wlS@2|5O6syo3(OA3jI zfx<{%P)G4Y8OB^SU^AUZ-DKIQj(e6@sbf4g^6xb!5x{rNx7k-X-Pm&&39^wA`k0&Sv5kGS6g5auej0%;D(a5V z1QV0eNjBb^P+&vM(V)pOWR?Cp5ISRJ9aV=mVvfPI)9Oq`Y&@#%#}h0uq1oTG(vT8# zQ2sWeGb&((9$2?W;_Kj9Fbgnw-tibkGyAcch14oDTO+VwW+_Q+61Jn%0_zL-MX56K z+EicQ&{I~R$=KQrze4*&VSEsiD1(#pfhS>t9LhaK57|tkUVseX3Rxs(%Lji6q+KbC z>dr{Rghc|W&e5F8Q262hWo_+JJ(X;Uu2u=23l4=wnkcY0P?n$1(P=-5EpeMLiH+h; zfml5tg720DvjHDP6Z|7oRPrP&Dcq<81%%3SJTIjIS{N)#$s_P6CJHytgqyVwb~D^~ z-a&~imfiW2+4DnM4PyyorMV>>vUE-HrHMAu?37N_M4C%aU!!@sMay$~feAclB51*k zwIqp5>L|`v7%59-lIdXbPGc@o_tX*+ZX>S34O4mpdbsxw)?jmx;rYU4*r4oT%cS&{ zFjvsHAsE7R7}~aY@)1Nw+T2FF3WM4jPo9~e8i!P@W;R0blf@fDCSlYuM~ro-IX4uL*>a2iAXVAyvH2S*h9*#Xu~6AldKca-;qjl;S2!GZ$he4bcSyoW`U& zr?4b(boY!|RPd2`@Cy+tld_|Y0*eJnW~J#mB12+%Sn>1>muc6C-rZZ;W*TEQrwE^!bw-4F&y*S#g)$}Ow{RQ8)ZiM1NHRGvQ0(mH zK*q$l|0nWq5bKu4aI=A^nVJj*5!(A06X-D(J)6XN3TLlPC|Kt3rOX!Msa76r$gHA)W&B_TV5a9(S{!>k zZ-QsGM0Yw}mo%tV;_W<*-WLY0)7^u-8xm8MLLupzqfa_g8ss}&9cIj2@mOm{W)}M2 zN*)ueXU1mJsFYyXM((`M9>>mH3CZ->9)~rgB|33HOreX(Zcy#eN_%j)Xz+3&dd)j} zD%~cNXUQ;*eiY+`PiA8vsOl#A_|aL+klFRl#{?--=58=dga3;?Vhlw=X-YtIumClB z(@Tk<3;lhx2QeaG@O5IY4zAvf6k|qNckY7NVK)E?qym$+FmHEPQpGazG%9kcV=@%; zc)+6#uh(|-dhFqx^G)%P98wcJGKemO@rV@Dl$$$6sANRnC(JeYZPRUxK)c`MQSVFi zzaUnqp%;-fRl&1TCiR8GG!SF*#H$|rnQ*vg8p{6y8?Adv6 zCp~9BR>8@L-rPlv#a?5pCO@8pXL!MT8dC(I+I!yJZpb^M8z!Ex0|%`|Jky*3;NfII z;I>lvn zRpsQFBVp2c!UaOh{dUOJRYV7H4|DbGw4U_H33*QjlB1<4=@dx8w>|x z8i)WNKt71ERr5TCx0+@HWi!u&+_suvP-b<|lIxN=bQ>xgE~D0EWUg^(OwMk8P(sY4 zpSn@GGjgLrjRZBFR1;=y>?bdAYP6BMQ5*&m($&_t1R+ReI7CIFdD z!=x+6^8mF<_sBd(&_;tF)OYz4E~kxAON8Y_8uXW!(dP(`gg230oL^%=C{-%=q|+X3 zaJIU;4Lcr>K#2?iYs=V%AkSHHt{qstwU#+s++ zBWS5!%6_!n7?#P61s?7@%4cwf>1>3NG5+L%GbugT#O9@013X07KB8y(nAy`Kcr|B@+r3w)y~sxHH@H#Oc=?ZF$0r=t)IK3 zn|Je|BNig~f|n;1=c;kx*-p&UNhu}_*#=V3@htA#>>`_^vNij;STTz<9{EMSD)-%7 zqhzWi95y7yjX6y9psSSh9EGRxa4|xYw0LwDp2j15a*Vq$Xy4IB@ZL1HPcKZWTbvSh z$1>1uJcy*ZN0n?PEbp)(2r+J~FnK55NKJ{RYit9p$fH|L0U+!MSY+MfiD&~RX#xdi 
z_sllpcrp(3{J=Oz%NX8UaUt}Jj8XOsN~3nfB#vmOhTg_g^XMF`@o*<{WALn|-BTIJ zS+5@S=aG$Va{|)3)n7VF z?tc(jTz%ubk-mgHIC$JQDmUPIWM|mi?Cm^Gxy)L9!( zop<@hk6#@Qlq_GpGIj00maL5tMa`M@zU(A_S?C_pqPkhdD|wb*{^E>iUiR>pO<@XZ zu4L!z_Q6>zrP&t=_Q1r_=9}kuL%V+qf;34<6&0rxfNCW}9Ri1ImIXKdcfdN`vm%pV zK*GfhtkD(>Yspc<8;F^TP(HMn z0wA)_^G9c2-*Ui9KbN#&L+(af^jVnb1XbzIgY9i{z0ENB_N}^)CnT`oo7Sh*A-#y5 zt#|NH7nf;flN@hSN_BZ$kYNV=p0%9B!Wo_b54@EGyFpO+B@rIv02cx0Q?yF~Wsj^z zhn+@r)G^T3X&6xo#yXWdKkIiI;A~ki(gr$8rcjuV(#`&m)MKfZMo*%Z@*vWy)cA%H ztmNbA0)A*;(hKmYgpp=$Eq&ak!kC&6AG|D&V7 zcA+evgb$bxy-tl_leG9)wPb+pg$!zhC0fS{Y;&WC)om;(wE{aESk#O_o>SARZ|lDO zJ0jo(x!CiiK|FFipF3igk5}eXV?8PER)fREpr6r)H%qczTo0A&ga=^Vm{P9ax*Tq6 zxd%} zJy!SDpjcl<-T=Vq8JPmH z>locxB4Rm~@&b&*pcDWnFGiz#q94W3T~32JyxcHi5GbwkdXpUI8XTZy*f>gKICllJ zo^t`KO$6{PSWvaUjF0@DEF!TKy7w$v&e)XU9~oljmS^D4<6c%F?iuiS=o#x zP2%FEqjMg<@^p<4IXS(dyOeqBFzY$5kbNdm4sGIQR%WONr$dDK?(h3ij_y4N>MigM zZ8kV|x=nvq>v0gwC|*wYfH*J10I8%W&1kT|o6CDrMsvKOdg~)8YO?>ioVa1I0+#XJ#^2uj3Vq=@JtToCz1zja?Bd&P=j$u(frj$Y5czf>edNOc9WJh@X~RFD-UW%8*S z|9HM%M`VrvhUO!Wvv&lEh^J4Rb@BxeT$%?pn(3#T6B*W7HEPUmK)e^Y48j7f@?0)QMVd3634*#t(_Y3h}p1@Y1Y zx9CirgD?Ok*{8JsnUe2mJ|$!tIcNdBD;mT8F{M7Ql;P(BCS(XS<_5L$+usiv+LS-49=aj!Z}w;n@ZxO`w0DhYE%Cuc@LT z7uF3-QR^Tf_y<#R4uN)? zz!D@S)M2hgxiq!gJCcitcHQ;mpf!E%!&JBHuRoyOa*HLCUW*YF;s=hY!Y7nC^4~h| zCTulD-JQTdgKajK)wjb{FsAgnG`7gh1uMg@Wo#KRoNRPwx1{581Lao zzd1{aDc|1ry=i_I_%9)PP&&9fwu0b;d7G+j>}tH zsw#16r9gS>liV4kNqv$>O(CI}HQm^Cp}3~!S=+m^vHlOBop&Z*wy!9porv7N*F&g- z$(!zvud9>gUGTZUYp9E}eJfh4Xz^oeLWisM+ndi@B%0g?ZEgbprE95T+k@R71lrag zi0XTFuEO*>cWP7r1I6(ZR$qFQ$^Pc5>?>tnG5h5jvJpdnNKfG_Y0^;h%MOn9MG?@G zC(-6n4%Do?T$U3!UQU0LHIX>jY`;}sN!;BeZuF5A(BG}HFG-_X2S?Zx)sSop?U)}H zKB3)VNe*unGs9a;kNs@6?+8$^h1|Zk=d4H06ywJSCyW%@92&m+i-Q%Ej?fVJ)E?At zLy0;M$4f|feTAOO!TBg6QvE_TWfjioWxK>9om-8_u_}!^G!Y(FlZ9v{N zDiU;Ma4cNwySb38ITSXLR~I_S00klT?`TjxBIj&^NGK=WSb;W?Y*M5LOR+k@ NpNa&;8T)6>_&2*9=-lvq|7(Va!JGr;MUM{hpe~0gbSnv1S*PM0Aea8>+ ze(TFmu_AK)O`T)jCrQ1q@sDiXx$}>3|K-%bpN045>VMqf)!kdCKOR25?(auF-oV@? 
ztLyAn;ZwC=Ul-q=U)K}R+QWT!Uq7GcBkxPtipb~Z=Sz=6*YsasA4@|sGb8T@7Zf3) zeeXQq9{-+3PWdjLH@=Q{mazL|<{r=g&i(&lI6PP1_jcGvuoV8u<=es6-QCCA{af+j z?tN(}J3ajIF}iy147=}RrTMAn%GJ@AzPUxnwX*zIeY!n9qrLC6MqBTr!Q{W)^6$ja zyrni|;>`B8*R1v?P2aAWBH3lj7vI%3^;sqF>hvmb%k$=Cw$XXr+B81NjR!l|sqEcK z^R~2S14sTbjX&iv>J!y<%H`XaR-F#NuXmTWZMIzN>|6M}>R(#s^eh)kTgtZ6;~nj< zR3q*8Y~}>#mMq`Br(6+834N?fTWVcdi)Vz&K^p#Ceo&p_``A}H0*Ut~J+ku4`J!+5 zZ{;sjlbF3N7?oH1-0Dl^W>Y1rWd3kd?ls5Oj!mTK(!Vp+`kZzBn(gwu z@x1AJU$mf;*=Y^=twGOH-79igU%zf+WJo4dhFHl~ilTc^BwBWO(L)P;#mez`3@7A_VUyq-zHq9hg#K}CJ;B>vo>+8Js zjbRuyTadcfYMR8R->#_yR`q8UwuO4D&64>!Mf{pi>EFh$*1Opf`IS+JBIPL`u5T~W+vD6 zm~{PDCfIy59(|Rwg?YZ&D)QFA52$M8EyxvX(yoMz!Bz5deV?9Kl~-lFsY%#YH=mWH zhF`kXy~g(b4=&d9uf1cvwuR7M-?PpY4thmxV5n&xHttlH_2|H0yo=t}^(kK&z(rmO z4J}g_#*Uu6gpW1_jiJOWfbBl)N#elCVNKK!rW@952 z@Rj}DYE{L?)*aJ_SQ=%IBz4JO{ji|A^?m8jCBrY-f9&pX^atw}sYc^xR=}?`JGshM zOgD|9SzW@$R1hnllB}z%c_BybcACbOSZ#twh7T%Ojm%2h)*4$hHf>&CTX62b@L}Q& zGJ+=t3uOa-8%oyimZ^7Fq!=U@T|%`fH6}@2I=`Oc>LeE-INHg*z#nrvabL5lAM&^5s43jXM!0ylS7G!MysHvay z05d3E(KusrVh}VC12IUZ*?^xLON5lxDwU)r>qJ|P$Tg^@= zI`i1}ljsvj<*U~PeL1IA8rYK}<+)YGc-pjndyTPa&Xr6kJwsHfQ=Ks-< zzk$IYd9gNsRqC}*$JT6UQRjVP>D>%Mv=uO4dLrGztj$)F38HY%K*E5jE4kGJrYdRD z^Q`{9h)D`%_>&rSrh@VBtcfM>%PN`5K%dJ)`nqJekhDoMN|4Jp>8}`#`2$Z2Ff^KC zBN36i`qfRsg>2}uL^h$sJ%>Oh&cj?AH8SHx9A48&POoJ(_qLi~(`;Ug-?#Cz*-iJ> z>>p3IL~UxX%Z;)13j^;}NDQ3aUB?D#jG9W zXoDGfyM+Q;-Ml)2_L%3$!ebCBms4EgG82ZJ0D7`K^zEFig(gWVv!S6lN&^#2K21xq zZ3FyZ!CZnO5zVd=!O)I91WJLR%rm_qu_7aT6fCSV%}wD@)Y@DX#I%`oN*020VgAC+ z5H6ZFnHFg0STA8*F=A>6xax~H= zF-EUfQdOJ_%PdX-;Gox~_r!}P6P}f%6Qa&L8;@7mMR9_*&kk05cV&!_UjqER2T9UI zL8oHO9TQ*9Elm$@*xEp=C)2MRES_x7YaNPc4# zQ!tm&9~DPbZKv(h&|-k=#wcAqz_Mv1mXFI=j&$+}GK@bM^~{Hi%4`IT2VVwIK(3sg z>cLeqlQJyJ0P!9W92sQ;byHq-Uo;eo6X$6v5*^pVl}OAnVajJ|mpGLWfyo>iTa3CK zpS4liZa@l&T%IA;#$f=xhVzg%7ldsg^7(@qJ(cnHNw)epO`8(H~623t4V`S*p0??r07g<7xOkvg)V^Yg@Kapkb8Q3)5tDVF_G~7TvA! 
z;6FaRRp$A?am+g>6#{RE%uE)iC>S>vd#bW?LE7)^*(C(0C?q*`56G-DDq`i<%kf5*Jrt`??`Er9Cm^uZZ;IVWD1L-39DL_$zXPDY1 zGn+?~m$MWhb~OS8K}mx4n8h#Y9unHw2aI#WJ3$c4T_Vo zDJI;j@tRz@Cngl8#7D7FhUMO9hVe*OUV-J`*QJVWfIiq}d&Nu_==s&ebk+i~jax^a zBzMxn51#@}%T7l6Zk4M~ACalUO@F;6Z^eIaBbek9L~dWV2%G6|7*nhSCK?K3wk4Z^ z`AsL8h`KACw1#`{6bwV;Bv%kDXhw!HX5}=?2*`r~P&89JCtiCBNgbB01DXm#j5b^~ zwA%2ELDq2M{9CKoD$RJCsQJQUvArF{Qd=Y7RC`cjYZS|(7#0F7U5PU9WsjiaO5J?> zAyx2p4BI{^`+<=u7=;Orl5T~RYQNiq875N0e76Ikx%yGhx7}x6yERTK-Nt%MKT*8*{VNejntbklb+%7 zs;2o59gaQWP9rD@8fr))F)xl{3(UVOz=I;oSE2uS9JMBG*{Nn&Kmb8Nb#OwlU45IH z-PmeIlzpO{d`%5EVEV^#y}=aV<)Jbv6eKA~y%1=>EWFKDQlT%2p33a}UDp|kLx)tm zwiYakq3%jkCMT>y-ymQ};Vby1VdSy{A+6gfT#t^~Gl*T@2Z1z7TMA<%tYHomS37x#~0Ja%?W#{LIL~>L)SGyKsSb*V#p%Ubm)>(fr?6xjRdSj&L)nPB~PI)2rQ&IW>8JY zj$n4F(xHJ|S03mK_)}`*|FHK{-_{|rFlp?uWg{wTq-k^8Xo{}P{@N7)%OhcwqG6tr zS(&Q_Uh<(+bk(src{9B@#Fd0Ji(me^qD3A0RkXI-`vbxXJ{0AcT3-ZNR-szw|N8e0a1K zUs-r_Vh}xx8b(nD(N8p+ah3w9SYCpk*0y{O%6gN)tmIz}i9Y3pvUjk|;Xo?wl z(8{$DKo`k$mKrv*nz2m7rse`&Li*U>Bce{+Oob8-1fLYT{gghjct#vtg$>dZG)k7k zT&$mLbu{P;2x1oC{+&ZRn26`8Lg`{k$xZ)=!xV2qopQEiN1L`3#~Z(t7QPRvEUSkJ zoq+8bybGs?-$_7v&PrziIo?Bep;hb`kZXSw)ucOSf{K+`0%!znmA8z3kYQ_EB;}Ri z4espCQo@BKLr!qwF-%dAB?68e15~_PXBx?f%9W<|D;W1P2IUxux2|G`*zC>51Up-d zU&oka1fC=!8P+!F&x>L4#yCD5xLdc`1=_yO2=sRVZG+WHs;Be%~e=XbdP|TMf6`7HAAtSTsHatt7Rb=l> z@a3G`ivP`kQAz+PKJzYe`@Y9%`0yX6v4_ryd~NVleu`NVM>HI@60UbjR<`8MXT;Fe zc)M>7Vaj-Rbmiu#Ve3a?g<)#`WOba=7*gC8i($g@1Nfb@5Z$3&&6Giq!0D(JQ_e}7 z%k0?F^z1s+Lt$bsrd{H}6-Ck7r6i(F+^LpSSo9@-tGx?bfFgo5!=nf!V^lQ!C@8Gb zT7sT*wGh3aC8(ZKUubG{>>o)wMO^1@&%4M()^s+IRMyd<7Boaht(PQVSixD4+-)=z zcB%TphT2bPF9)}v>N>!FhRo&x!84dWp9+Er68+Y}GoP z$NVWPrIqIJe=B-a5_$QQ?pEWv-8n3 zNvjA)z~RcZD!XVWvZ0ZAiiwj95l4{lh>@?PX?p@mlkX25@Fm&n%|ya}bMuIk_Ci_6 zL$#xt2E<8q;Irn(a9SfXfXRXa#)oHdnc-oJ_IDdkgY`y6tc#c@>JS{gdjqIDUJMJ`la!ii3C6hc- z@=!k|g_|^Ho6&0%kfbOncAVy}k>5@TO-d?D4b|How7vD)*n7o6okAp_)ea(e^&H3@ z%%*mXzOf}@YttTB% z|G(Ir-2n$M^bjTyHQG!$`rjq1kx1h{EZ5F%v?OF0{3yFx6yaQmJW_O`d7uLcW=SYqVhapJ+TDEXjH*D=5%u~8Gf*sY 
zWvju?19bej0w-weziSEFArMOOL9EDS@4CB5k@vBxk>6M zX-5&hG5)|Sw@~*92{5&w_*K%cCzx3{uV<*$)*%fzqs)$u%*Wu|M@wzR=^v6VbvX3* zMJUwRB!wc39;v)rIF=O_caGhsXrz8%v zLu0>FI?ssDKKqGgj~lfZB3Wt(CK))D+9bIQvF=u}s6~LGOnNeO%?%PoiX0@Z)YdD4 zYdnTy$h3a_dlHi1mC-63=T#aMO01FaPyCtK!&&i@$>Trh4?%R=ku4a*|GdOE8nyn} z2}G@10s36k)nw`3xAbDO%9bd;Ln6a8$$11gJEU^){VBC0;m;wGKA@yA8C~d?Mg2Hj zDQTrP)E$O_+R!1yppr@S=)zO8iKGsNP?5u?)Z~fuI$B;~{cHODE(z+fsr|g;PAIIR;hZ5gH)<|WI+SFma(F)XQO>8Myb5KVB;E?)B zR}-n?@A$$p$I)iBlB24XzTFnUKXmf}pkIm_<=}YjEim>Q#%y*ezNx=}!0t$?QapV@ zcjL0m@uL*?!g5%Lq#i-c?A)wk71?dSjGnu8D3nAq}CWt zi~t+LfOhc`3V*TVu1*@Aeu@Zcj8aJkW^hVNDndr3R=Y}(iLn;=6zNQQkwVRGq!Vgw zIoE+FaKnzRbIC-dgQEaZX_)Qn*QU#}%@8?4t^kctk5nc3a^_`mva*!KWIx0`;Igh8 zXljbQF&+gXHddIoWC9WNcK_XK$P5lt)%ObNbikCH%lt&A*mmVM-9sS~{ZwunrsbsE zlx|WrAiyAz(7k}rRi2U>#UN+_s6ZW`0A>}cAdMPi!u_rhazm(3oloh7n8jWKBTe6= z$VjKlf=TbYIXiqw)Eo7SEMEgeDVPzg7^Aod4$=xcTRsj}Nnk*543(FH-Ab*;-8&3B zf(80etli1|C9rGAq2QzRb@Z_RqJGC}J%&MDSBBFL1UtyTArFoNA_~dMAUZ}$uMCm)~y{QGtfDdk{0!OQu4bE9)0V7W(f&rVp9Z(gks$`c?aeLjpb z#;~MZIiiqSL>Ffb4moL+aH0Onc#9Yg!&%w_qBlCOuULR^XTnO0K=H}w!G>oBaaXuY zv@DR?-2h5LZE}LG0W(kumrgVE<%?7uNFnx7DD*t(q041dL;JegIbwQ~ZkWX17-v~I z`AO{K0#3rBY{s0nw%iU(i*1L~et?LL0#GlJXLrIdSl(~Fw?n&G9HZ`_NJ(+%2Jj0V z@+|`ThXFokoYHk)iKStU^(gTY+^qDN00)nAB{VQEc6Yql1oq~LMpPCshBH3e20Xg^ z<$28BvkHx3E$9TsXwJkc_4wUsQ?mh3ZN!@-VMRETu6Dwjz~{LZW+k3UCQWjdX2E4^ z63~M~;~;)NLm1R*3CZI#xBe3ZwAebu2Wcoy`#+t0uHf4v#MwlkXeprz227v?A~H!s z%G)i*P*^>~5)=`9!xZL)P!`k*>rcx6lCpw#JDm6n6>!N}7jSHb{Ax_5hMf}PN`+X1 zBuaYdjkPF%3$;dg@xWgw_LYDGf>em4R(Ow~m<|9rcARF{RqOCBU>htS{w%&KeVQU8 zW!1Z}rQ(&t%de>yheb*%*(obZA>m9+x{_mtMs5JwB~D^KJJ5%eF5~sIs}T)Br56{T z!By0{42N8P;53_N52~}_kHm~wJxo@`x^p5|0aLm{v##_b8D+nie0beN&iAp{wp-$U zZ2Z1Aa06YP!XD)slS{2EieEC^)1Fr$yZU4sNBM6Q4?$-~3s9PfLczm-O=8`%8Y2+BIA^KT z4iT-~1^%Ka|9lC=e~*&+C~L|R|PugRz0}B z?3puP36)|6J)MB`S{jc%jEG<~+e};(74x%+W{b^Z_<{T5>Ix^u z+lQnm7piLpN_l@Hn5*srNTohal{gD?nS#oj(hxV-$c8WSqJjYvYV`S17bVk^gC4D= z$DRNO;D)r>YY#XpTpTheVguzTkD-DA3(JhC7OzEbuboMAFdOES164M44S@0yHtN1` 
z1FNGMZY}51l59{x!_QHIFHV!$cDa2Ba~*+0c>#%Vbf zH+%RHi2x3hPNL-COPGl73t<`ZL5KR5PttdbpcoUu<#}enfSn^D-WyOQ2kb15V_+z|+p}+?Dh+^|n}FG7VMzU5IFiQz?fc{A?rkN-5Fj-r{DdneDTb!Xy zJ)R->kQzb?4Y-**q;vj$QlOBcKwcHWO1B6T8SG$FL4im%eDa7i!8S8H2g&#xJOxQ3 z%{m}<9iGH+dqTtj&_agR^i!2ma5gXP^}6M_Az+S79~b;_0FYV;PFjd}2C^9UFK7%3 zU9GkL;7{j^h)|3xM54WiILYA_Lc#`@&~-ChgiP_lR)Qf!G5Dr2mc-=YEb5t5!&Ffj z+lDfU$o;V?XY5>og?en;7s72p_$@cJMf7a%lnpD)8t4?KWs%Xn=q;0Hr9lC2@kpOq z=h%p=zoUTY%H3>PU3f?$m+2(TwG-=zVL~V(Mna>kUr(?}xI7Rhbli`-^O!^NTFBj=)3PXl2jQ`7~2STZWX|EZEz1BBf@lnM`v(uMO`36feL|Y$%e@~K8H-Xxhd>d-pCHf zw|t{|Qjmm)fHicP(?p!G0$gC=RtBpYieH1al63a3eLUAI=I`ziIr#axe*M=RAAFui zuUV$IhcoYaZBJd@T2ytD2X;jx^=0=j?RY?oAdg4secUcER=4CS-i; zyF>(qka4S>QT}zqRQaoYz>$Kjqwegs%eV-t4R24Or;43;v( zq}hl@f^ui#s@YNs6=F#~pJp2?K7X>_U&C6g1cOols*I?hEZ*2~*1xb%nI2&4j*G9y zp9wiBX6(&BlfHkVv$=v}(?&;y2jyxAL{%2LhK5T!ZWJ{pHz=S0^><5!!&{gn`SPw9 zTG;Cd1Wbmfbs&pFHY*a5kz@0TGFcHpXuEPyqvFZE@@Gpo4*nzBp}?3>Cp188YKq>_ zgE-!DCD1HXG1$rptW6OYYEd2@tfudea!D}J!W&RxGdfu}+>mw1s?kbwRmUZa0hK^d z8}OJ!+zGJ%&=^BET!mGj`EzkAj6A>8SzIC}RzZqwfRU zXf+X+TG`}O*GW?@pcA`|HDG;@!`$V4_W$7Uf3#*V1dTwq0S5gFJBgx{wRlK%Q9eD; zcJ?n5ni0IXU0_&%rt#EAV%n5Ny;x0jkkoGlW}+5N(jEA}&6^Zn5lQ<`q8cJX)T}wp zrrH<3GKy7m+oO1qIBuY5S17D6gkl(4ZV{g)ennP+{3+grUIRr#vw*=x625AJ#K5u% zsK&%~Qh-x`B)UHQBy$8ymF+4$aEk(#C=R9t1#9yYen%M$B)*~ zwU(qtD*b0m|nckJ@=uCp6Tp{v@q1!I19$`6EN3T)Hf!9nx) zS)4E@B7RYsW8z^usJ4#Gf&T1D8tgB)08m$LeoG7NLRty=-(FnfA@P($WH=;cV}rr%!Lf|3o->JMU`iEd5Kw|e zvL3HG67UQZm5VbXOw}9fXWY5aci2m!nKW6~8-Z9?CTkA(pd17m@W?7wFc9KyBen(8qxuD_wS&+s$cXI) z1Iw)GDV?Os)e@)fmC-O62gCy)o`(&KLOCGUbt{12Km6u!BfnTYC!)!=O^pE_{B43= zqJii584g9r%*5389FA<&jiI#i815UJB&?YY_Pdm(w$Twp*NtC!_&2_%>w0W0ASei0Z(6j(Tv>U) zNUKsNA;xdUZ<8~22E`bAT(FKqB45C~8MLJigD(S33xbyU}u3rfgT=6xU@ht3on=fe+P1tHm7kuycl z#Tl!4q(q?LA;*+ti7!blv`#uLgnVMK<`@u;@U^x#w1=P5<{P=Dw_a?M74z|S+-MB+ zuHQzeQhl0KpuxfY3<(ju=+_;;{9Ut?pG{qt02U65t*&#kgHj{=aPh^PHCy`7qZY`mq#8Nsy4AHT zhaPlbu%sN6+juQniG&}_t5Uc5ShvnGUL~~~8{I-N1clVddL@J{_K~OzMUzA67H+np 
z{E|E9z}UVNHpH+{Ja19pa*r3>Fkkj3Hm@>CpWEk>#j>@z{o{B_70VrfWe4^^gy;j4DCX^2F#jZOW2#1n~NuRd+9+hO9jju$!1VIP74yhaH~OmVUxq&$?!K4EeD_EgBKf{!k!VN=xZPG zTkQ7(mj@RItlR>3IcaW}5SE-~!R3m5ctBHrkHJ#;&i{;fw2>3%zrfeHQM)>&dURpe zQ>o?-MQPaZBvet6L$&pUvG@DC#0DaTGlIPy?VjZjUbIOl)b`TKy`_=!loS9Dg^K0O zQ_Cr>i&=qq9nZh*_Ml{4b!}QAy+xsqJz=I249HEw9vtxy<>UOOa%CVC7@=^y-un?5 zd?2gc7}UfvLL5M$d4A~uzn|d~+`^=J%18Ub5p-`&!!TKB4phRoR#sV(b1nLAl(*a{?8}UQ5pV

?5&|K+bKfN?Il8*@tFQ1&$%q=BW^gEU?60)HcZZbC2arZg6l{D?Pv!@$Lbgo?68?6u4i7Yc6f{vRrj?}!1m8S(<%FdLeEdOr+o_5S~q3M%&Pj~bZ0{9Z%(GLMOmk%m0 zDV}HXhuAH0p{j+0`YsV5%|ZpHIYRBPmNX%sz>Yp4V)GmU>=L;IHqhfBt~)rpWqe0^ zs}hl?20jwrOCo+j14Y}D#SbRJ)(&5O79l2RPx5$`3fw4Ce{0dXi$oj@+}Wv5(@&@1 z&|dQ1>O9ukxkDu|8AT+69W_IO%LEh)^%&xxxTX`gp}`aCP)|wXIPY_nX12<0NHmIR zs~H<}B&q7v=;7io;x0~5V1qEt$#~K4l;9v(sQ?K4olM1mgcfZU)&xjBP<&KSnAn(K z9tI!Np*1WF32xYcUex1pRR;u8YkSy|G}N}oFt?gCB&tEgO+L62zdE3>LvP@VWUQ%j zUecJ`f+O532Ndhgs2y-UkWpc$I>5K&!_nePFRNc^NL)GH!P6G2v0zc{6!*(c#dw`U z2_)0E{~_h3$S{m=ljktm&2$NlYr8?pmL=juL6w};jDR{fq)AFtM837Ai>Q$2Rb0`^ zOz|$em6AwFZ8atp@fWo#7>>{p!wQz<4{``+ns&kPbO@y4z`Sq$@IV6?FL6udF=Hk6 z>Wpfqm4kbOXgoZxiZPY^v}6@gXsap(#V`-2!ssWoq=Fs9)M&xE)6m+&8_EP%=&pha z0E#FXxN{WAUS_ep4VEJZ309uyQviXdE#)D_+rWjYS8k`J$PzM_dXhV6B*>IFI}lxN4F*9*Z^C>9XJ@io!Ojqs z6j~;)dE7|&%*}&HT3tcSf17_AUGR2L?sxF|*p9o@E7r@Ti-Mr-2up2_s+dN*8)j=^ zMdsFCx&!pcpbzcFi78Wb#zNgrj57R{;VfbcOPemK4`V8GR??YB>yC)6cm~&?MgBXp zk_~_Zn>`(gSj3eGj?-rzOT;1?Pt>=Dj`0`|E;shNZ4r+pt$5G}t!C zax|m!B=%UrYJ)lfPuV>VGh%ZmO2Z(uQO{M> za4QZ5tycl@t40JWs2Y_+IfW`#IQi;b(mTW9y3KS9lXS{612RPIh4!CH(A z2ssKyG^}o@k3qK(0=8}eI4w#%qY$*7yEk#}0A(@2Qv!>`HHcJtL3Yh6!fIia@jvE9+rk;ll9U#9KI z+S+(p+Q9%y$B-onBWfe`;JFD`bbfEH%FC+0;5&B>o?d0@WfJa!mjy*G31;y@iVD1I zJSZJKWGrQ-1A(B2w3*GbCuA^SF4IcXI(}R6a8|R?zZr`_KCF)!eVt&6390iQ<5F2v#qyAL` zGz}H8u0=-V_uCmFWDd*|Wr>Tt1TBH@%hWHW7Z4<|Ml55*Iw{7~6!#(=Sv8}BcL!Dx zetK5~gWA@R5PmKFwb^VWgjPzAqeZrlcyW4In~8bb)YQer;OLr^FzkX00eADTK4C zpWS}tK%B%WpqCfoN+N)gQ$F=br_}vb{k^8%W`%2!1rJ4$J+JtOUyF;dA|M=$<;kwl z;|@7%qSVZPfb@1EhXWWYm;WaUtyxC3hyUG-0LuoL*O&!f%gYgRJ-`{!wg{3;H*|C0 zh-cd(;)4Alz5?XKvb4c7DyD}dF_AFt$6i+c)C=R(iEYR{DYovlQyUYUt#08Y#eaXr z1BbZN{|o9Hx*97Oo``aJdk*WApme@Zqns8d+tQgh9$mypZCcctL>G+hm~k&IhDd9u zTNr;nk+;pGsi_=}aSV-iR#bEYBhSLhK-h3U5Q4@vOoRAAAf{p6NG@vGRMUJhFFDQ1 zYQGb!HauFq=_fjXC6@Ii&o4+j0CVg~&tD9tVHNBiPSJ~$2l*EyS&FFMnuFx7qb>G* z-bHO$@UMnz<7Tu@@TEc;PnZ z<~Y=!U#Mm%bpd3o210ML;+wHe6*mYtR19c|7ZKlTCxgsC%(k}a7XxOx9BndpdQLP? 
zi{gQQ(9Pdb36qgZ*3mrS!0j-&;fF6!$=)15GYIdLIR=D9B_OGdfd`WJ35$~EyhNFc zusRuxvJ>Qo%5oSEd8FL`^dKzH?g>O98Pj&&ChlMocL%iwwa#zg!kKqeep?qMbKoyV zW=VUVpUZMEds=&|F&l%;Wx-gmBr}302F4x!4y|lbqryd@x^0mpmq%s-b<*|5bjVo5 zFAU%MF+%j9#wB9aOGBK@Yey()5A6(^Kpa3R8(M6e8Hy+2TRd;$)5us^^?s8JwI@(% z%-(<*P{eYa$qZ2*qv2I*zwow-gMYPd<8{0yr-JsGB%H^x$4_KEIhR2OFLHTyB~zKk z&S!<=cu8=f$2oZdm(xqLoMlL96lKAK^6^a0gzjR!CWA5O?WM9%srpXbD@v9ZVWt*l!+H9*{cOU^c5jn2J+?W5A-w%M! z9bMwmyJ*Dr?fTP}O^5NfTOeqN(xKt5&8^YL1!PilsnI@Y)+^5yfVR>uaYb;0L{(-Z zRW^DPh+XOpx8E3$`7zzW&Ni*8YNLfQ znUb>Ihe0_gUF-jX_kge1ZdsiBReK??z zXFuLiggN!L0T+@Torw$id;vA=O z9>CGhuTN;PsCRE>TV{c!voB*HRs)^fe6B9TVh7IeIE21|%Sf7Ou=TeEO3Ne(dYGgO z=LTv8skQ|=JQ|7_yJ(GPQPrBKl8Aap{6jrP^*0*!4GcK1nz?MyV9m{3#>R_k{zPHk|tC-VrNDp#z20zZKz32J43 z31iH*V47NEj6edMoI7H;2?g@K)!60p-*F-z9@Adhyku~W0oC9$Q7uIvt9)m-c8H8* zi{GGL@SX;-3k2VHRrw#s1+a7hbl<}fTw20A_Ia9~%9BJd)Qp8_0No5o0As&%@W7o| zV0O_)mDmQQ?bm7geV{C&`C3luD%^>w`aZ;odf zw;E9sICQO)&{8bO-ETJ0eZQO*IXdI;?%Hy#d7C-YDXVDR4FqeB^>>HxzC{Eu2AKHb z&g|3WxnD+je1c|51e!3nQIM_fB)XPW6h{fmG_^P6RxvBVZ(lV;qSi++Rq$uHf58WP zvKqiL(^-VA%v)R^l!)X4b>$F$Fcd2wriy$x-2z7~%4CBqB^7DnAXTT1)<4=thqgl? 
zN+jEGYq-}fS03;@;_vp-X3Y~63XN1Kdg@G$;iU%NrAe*?X;Mn?ATXQLKK}H;Tp=E3H$6OZBuTi?5aj9|X zN!o9T);U`v%-#5Dh{lxGFDZATcCHjd0p!`$A+fc&S8b=A!6G6p|C%pxR%KJs$c+3I zreZ9LB(oNb>sFuw$&?>zpg(O~^ak95QO4Q2^pE^~?3A>z5C^R{A zlY zJ8Ot6_qI0V+xntGG~&;A^YCwJ-N%d{@@A){rujokC(y{aezGuu?<+I*ZY~#7EDVAy zh=oZciR?M+Rq5+}H-Mp2mEJ`B#S>ZD)!6^&tND&F%Q6$olBAK%(gCW)vrW~_r2ztv0} zi}_djZ>8bBx;K#&KBX$n0mGGkE z-R@KSnrGOGeOSm&i`-dTp2Xr#8JrI66u%veXbe6*!d@hJCGCP|tbGB#w|g}$Xb2%_ zwB0XJ)ZG36DkY=1czg1AXy<}c=uuvs(a+s5_(DT7`DRNEyjJosXK^XcCoZBKsZ*oq zQ~to%f##|)X!Zc@+q~z~$^`FDZ6casrCvnL7SR23Z~zs4>KLaZMK&%3PJUdE9SDoX z+*N^LI^nfRrSMI5kf48jLpvPA{KNU0)R zT!Lo5svY{?QCW4DMwcCF?X`tyasnLN2!fV%uLzhB5ewWEK&{0EYolg-QXR5F$Ubik zG&oXz6W17VSGMCZb=*~uEiZ**1D%t~x%3ta_Q(G>#-Eze_g4j;!C82jh7Ei)m*SD2 z^LC|G!8wwIJ+XR}=eM~I!j456r-h5RF&q?2+Utg>K!G2H6*P^!!lg6hgT9B@a1#=l zcMAHGKATb)F^j-x@>OU1{^lpB@8Gk2{@rtVcs4$v$FqL!AoABh-=cg|c)alcFaF=c zGK{OsB;SQGhZBD<|M~wPVf#hH))2o-a5FnQcQc#!vvvKvl6-qE0UcK`L9JE|dHk*z zSan}ctizGVG7M!9ob*U7@FZJtnEdqrvfWDqqVLbd9wGlRb>r;LC2r?v zE8Gd+N0P_Z{FBx+pMGAPJjVcS$dx}V3B6@MV6{ZgKj1wO9s1jTF5WM=Ha?!N_YdDU z{XD(p{$FZHZ!b7L=)^u@W$?@0Y=i$oE$a8AVHs2Y$?`Jc2f4t${mTqN#b$58Q|Ht9Q z_rpW()W7$cgO!omSrYcw2lhIXogwCW=2jBTdZauDuMxEIw%wg=Vuk$P&W^)zL%q+v zp}lh#x0Zw1%W80c|Bs?ej32*0yL*56cm{;^{^FYp_(_Dj#QukT0KN{);!HQsPDZ~3Ay^Y{@QuIT=VzG7ri?@?TCB%x!@D%PbS_UsR`cr*FNf> zF;&0rAEqyKt|-)h$-MpT@BSmqXQL4UlCIbrzUK4y=d<-KKio{<;|KYUFYP=j=HhwB z5t;iVTyOpTIs56!9+lbuy8T_&ynB3^SoMi`@6z_;2b=`tcID@t^PeK~PVu*Yz#8k1 z@XZf#JV?n$Pxi#!4h)Et0@0V+!*6kr@1>sq_EZ=I?5_7yxcuGyEB{&fe~>!skN@<~ zPsisEGdc6OtT&M&a1Qy$R;S@%+#K6)XhP`}}7ofJUbaQaVrV^?6IL zpE2+DSv%j(p1tgc_N?#q@baA2{=-W$Jtt$%2jeSA?-nfakZUU5xKLpF@!8!m8(Wx^ z-}x%kRAiK%`(erX9Z>51hXQzi<8n|IJiGer?zhUYv>0c1Cl|LTSJ!tZXUAo_Rv7KP z>G{dup5618|9bM%Yac_b6sNhq(g!}de|dTF>N}S6C+Er)zqXold-<}}SNYlNxt%}b zVXn_!&eyM=z07HQ)ehUM{p{73SM#sGw}oum_6M$`k9@2H{CHk>XSct*ee%P;{)GB3 zP56)X-w*Ke{(lsj{jm_e!w(<~e@gv-j`Xc)vLpS9$#bKhPaPhk)OQ*0pguiJN)=}6b~c5 z5W?`=YtAa1=u+tOPmctXcW7L`{4bgR_jRQNpm4cqjyt544^FV2rc6wayM 
z0M^~UAl1~Hhw~RF|2oI^uct~}@2~imH0NKR!{oR==M)!libdf1`0Xhkmhj<_n}a0a zmip`E#ntbSlDoT4ZeM%^QSkQpo5RKBMWn#bUtPcX$d`Nj-@g6)?~$qbqWh7A<2@3! zj$=ac`y@(-Hs)K*)ZdY)_9tlhUxcCEUQ|y^?^}P*9;_pI*3kR*K!=v++AWOt!xr?% zE%@fu+F#_sQ*~1;?00OzIE8#G?(dZ-UcEYee18$&o5RVQi`!f6$f?4~^OMi6?@wOdUtIcciZFE$U0WwF58A!M$=%Io z-n{D*-S4?}tZWe^dy$KJ^|Wkzb_!`Q$-T^F2ml9XU&Xuc3}Z2aoc5 z$4KnoVI;oMNI%+%M?3N3uoK^|V2j>CL-u_`M*AozO}?06{8y?(40KE57bkFUqq Q|Bu)I10^F{h5+IK0As3ms{jB1 From 3f212a82029d995303ce5372f62396c2ec0da976 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 18:13:34 +0200 Subject: [PATCH 087/110] Update tests --- ...__test__import_dump_v6_with_vectors-6.snap | 2 +- ...__test__import_dump_v6_with_vectors-7.snap | 2 +- index-scheduler/src/lib.rs | 20 ++--- .../documents after initial push.snap | 2 +- meilisearch/tests/documents/get_documents.rs | 8 +- meilisearch/tests/dumps/mod.rs | 16 ++-- meilisearch/tests/search/hybrid.rs | 30 +++---- meilisearch/tests/search/mod.rs | 10 +-- meilisearch/tests/similar/mod.rs | 48 +++++------ meilisearch/tests/vector/mod.rs | 10 +-- meilisearch/tests/vector/settings.rs | 80 +++++++++++++++++-- 11 files changed, 149 insertions(+), 79 deletions(-) diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap index 0aad0ea97..a9c76227a 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap @@ -780,7 +780,7 @@ expression: document 1.3484878540039063 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap index f2a5e1d69..e5d28e450 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap +++ 
b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap @@ -779,7 +779,7 @@ expression: document 1.04031240940094 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 4278d15b3..88997b715 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5077,12 +5077,12 @@ mod tests { &fakerest_name: { // this will never trigger regeneration, which is good because we can't actually generate with // this embedder - "userProvided": true, + "regenerate": false, "embeddings": beagle_embed, }, &simple_hf_name: { // this will be regenerated on updates - "userProvided": false, + "regenerate": true, "embeddings": lab_embed, }, "noise": [0.1, 0.2, 0.3] @@ -5211,9 +5211,9 @@ mod tests { let embeddings = index.embeddings(&rtxn, 0).unwrap(); - // automatically changed to patou + // automatically changed to patou because set to regenerate assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); - // remained beagle because set to userProvided + // remained beagle assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; @@ -5256,7 +5256,7 @@ mod tests { "doggo": "max", "_vectors": { "my_doggo_embedder": { - "userProvided": true, + "regenerate": false, "embeddings": vec![2; 384], }, "unknown embedder": vec![4, 5], @@ -5267,7 +5267,7 @@ mod tests { "doggo": "marcel", "_vectors": { "my_doggo_embedder": { - "userProvided": false, + "regenerate": true, "embeddings": vec![3; 384], }, }, @@ -5277,7 +5277,7 @@ mod tests { "doggo": "sora", "_vectors": { "my_doggo_embedder": { - "userProvided": false, + "regenerate": true, }, }, }, @@ -5768,7 +5768,7 @@ mod tests { .unwrap() .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), 
@r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"userProvided":true}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"userProvided":true}}}]"###); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###); } { @@ -5802,8 +5802,8 @@ mod tests { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); - /// FIXME: redaction - snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"userProvided\":true},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"userProvided\":true}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"userProvided\":true}}}]""###); + // FIXME: redaction + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###); } } } diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap index 433a190f9..d2473d00a 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -1,4 +1,4 @@ --- source: index-scheduler/src/lib.rs --- 
-[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"userProvided":true},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"userProvided":false}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"userProvided":false}}}] 
+[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"regenerate":false},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"regenerate":true}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"regenerate":true}}}] diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs index 3bf3727c4..efe4cf8e9 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -637,7 
+637,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -666,7 +666,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } } @@ -694,7 +694,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -722,7 +722,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 6f93d94a7..fa402cb41 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1977,9 +1977,9 @@ async fn generate_and_import_dump_containing_vectors() { .add_documents( json!([ {"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }}, - {"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "userProvided": true, "embeddings": vec![1; 384] }}}, - {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "userProvided": false, "embeddings": vec![2; 384] }}}, - {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "userProvided": false }}}, + {"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}}, + {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}}, + {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}}, {"id": 4, "doggo": "max" }, ]), None, @@ -2096,7 +2096,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": true + "regenerate": false } } }, @@ -2106,7 +2106,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": true + "regenerate": false } } }, @@ -2116,7 +2116,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { 
"doggo_embedder": { "embeddings": "[vector]", - "userProvided": false + "regenerate": true } } }, @@ -2126,7 +2126,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": false + "regenerate": true } } }, @@ -2136,7 +2136,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": false + "regenerate": true } } } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index b8a4110ad..be6e0b1c8 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -128,7 +128,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -137,7 +137,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index @@ -146,7 +146,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -207,7 +207,7 @@ async fn distribution_shift() { let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index 
.update_settings(json!({ @@ -228,7 +228,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -249,7 +249,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -265,7 +265,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** 
ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic @@ -282,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a 
Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -370,7 +370,7 @@ async fn single_document() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0}"###); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -385,7 +385,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], 
@r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio @@ -394,7 +394,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions @@ -418,7 +418,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full keyword, without a query @@ -427,7 +427,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of 
the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword @@ -436,7 +436,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -479,6 +479,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 19e495edd..c2c1b9fd7 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1374,7 +1374,7 @@ async fn experimental_feature_vector_store() { 3.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 1.0 @@ -1391,7 +1391,7 @@ async fn experimental_feature_vector_store() { 54.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.9129111766815186 @@ -1408,7 +1408,7 @@ async fn experimental_feature_vector_store() { 90.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.8106412887573242 @@ -1425,7 +1425,7 @@ async fn experimental_feature_vector_store() { 32.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.7412010431289673 @@ -1442,7 +1442,7 @@ async fn experimental_feature_vector_store() { 32.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.6972063183784485 diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index 0a568553c..60a0203ed 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -95,7 +95,7 @@ async fn basic() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } } }, @@ -112,7 +112,7 @@ async fn basic() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } } }, @@ -129,7 +129,7 @@ async fn basic() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } } }, @@ -146,7 +146,7 @@ async fn basic() { 
-0.5 ] ], - "userProvided": true + "regenerate": false } } } @@ -173,7 +173,7 @@ async fn basic() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } } }, @@ -190,7 +190,7 @@ async fn basic() { -0.5 ] ], - "userProvided": true + "regenerate": false } } }, @@ -207,7 +207,7 @@ async fn basic() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } } }, @@ -224,7 +224,7 @@ async fn basic() { 0.8500000238418579 ] ], - "userProvided": true + "regenerate": false } } } @@ -287,7 +287,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -305,7 +305,7 @@ async fn ranking_score_threshold() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.39060014486312866 @@ -323,7 +323,7 @@ async fn ranking_score_threshold() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.2819308042526245 @@ -341,7 +341,7 @@ async fn ranking_score_threshold() { -0.5 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.1662663221359253 @@ -373,7 +373,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -391,7 +391,7 @@ async fn ranking_score_threshold() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.39060014486312866 @@ -409,7 +409,7 @@ async fn ranking_score_threshold() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.2819308042526245 @@ -441,7 +441,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -459,7 +459,7 @@ async fn ranking_score_threshold() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.39060014486312866 
@@ -491,7 +491,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -565,7 +565,7 @@ async fn filter() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } } }, @@ -582,7 +582,7 @@ async fn filter() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } } }, @@ -599,7 +599,7 @@ async fn filter() { -0.5 ] ], - "userProvided": true + "regenerate": false } } } @@ -629,7 +629,7 @@ async fn filter() { 0.8500000238418579 ] ], - "userProvided": true + "regenerate": false } } } @@ -690,7 +690,7 @@ async fn limit_and_offset() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } } } @@ -719,7 +719,7 @@ async fn limit_and_offset() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } } } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 55dc186d5..8d619a15a 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -59,7 +59,7 @@ async fn add_remove_user_provided() { 0.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -75,7 +75,7 @@ async fn add_remove_user_provided() { 1.0 ] ], - "userProvided": true + "regenerate": false } } } @@ -112,7 +112,7 @@ async fn add_remove_user_provided() { 10.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -180,8 +180,8 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index { {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, {"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }}, - {"id": 3, "name": "intel", "_vectors": { "manual": { "userProvided": true, "embeddings": [3, 3, 3] }}}, - {"id": 4, "name": "max", "_vectors": { "manual": { "userProvided": true, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, + {"id": 3, "name": "intel", "_vectors": { "manual": { 
"regenerate": false, "embeddings": [3, 3, 3] }}}, + {"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs index e11f4368f..3fe161f9b 100644 --- a/meilisearch/tests/vector/settings.rs +++ b/meilisearch/tests/vector/settings.rs @@ -79,23 +79,93 @@ async fn reset_embedder_documents() { "results": [ { "id": 0, - "name": "kefir" + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } }, { "id": 1, - "name": "echo" + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } }, { "id": 2, - "name": "billou" + "name": "billou", + "_vectors": { + "manual": { + "embeddings": [ + [ + 2.0, + 2.0, + 2.0 + ], + [ + 2.0, + 2.0, + 3.0 + ] + ], + "regenerate": false + } + } }, { "id": 3, - "name": "intel" + "name": "intel", + "_vectors": { + "manual": { + "embeddings": [ + [ + 3.0, + 3.0, + 3.0 + ] + ], + "regenerate": false + } + } }, { "id": 4, - "name": "max" + "name": "max", + "_vectors": { + "manual": { + "embeddings": [ + [ + 4.0, + 4.0, + 4.0 + ], + [ + 4.0, + 4.0, + 5.0 + ] + ], + "regenerate": false + } + } } ], "offset": 0, From e35ef31738fdf2cd473ce55986c8e99d04966b69 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 14:20:48 +0200 Subject: [PATCH 088/110] Small changes following review --- .../index_documents/extract/extract_vector_points.rs | 12 +++++++++--- milli/src/update/index_documents/transform.rs | 6 +++++- milli/src/update/settings.rs | 1 - milli/src/vector/settings.rs | 7 ------- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs 
b/milli/src/update/index_documents/extract/extract_vector_points.rs index 0a27a28bd..736c21c9f 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -422,8 +422,11 @@ fn extract_vector_document_diff( VectorStateDelta::NowRemoved } } - // when the vectors are no longer user-provided, - // we generate the prompt unconditionally + // inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from + // the previous version of the document. + // Manual -> Generated is also not possible without an Inline to the right (which is handled above) + // Generated -> Generated is handled above, so not possible + // As a result, this code is unreachable (_not_generated, VectorState::Generated) => { // Do we keep this document? let document_is_kept = obkv @@ -443,7 +446,10 @@ fn extract_vector_document_diff( VectorStateDelta::NowRemoved } } - (_old, VectorState::Manual) => { + // inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous + // version of the document. + // however the Rust type system cannot know that. + (_manual, VectorState::Manual) => { // Do we keep this document? let document_is_kept = obkv .iter() diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 467a2810a..997ab64ff 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -866,6 +866,7 @@ impl<'a, 'i> Transform<'a, 'i> { // The operations that we must perform on the different fields. 
let mut operations = HashMap::new(); + let mut error_seen = false; let mut obkv_writer = KvWriter::<_, FieldId>::memory(); 'write_fid: for (id, val) in old_obkv.iter() { @@ -886,7 +887,10 @@ impl<'a, 'i> Transform<'a, 'i> { match existing_vectors { Ok(existing_vectors) => existing_vectors, Err(error) => { - tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map"); + if !error_seen { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map"); + error_seen = true; + } Default::default() } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5421b64a7..b792cde52 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1182,7 +1182,6 @@ pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, pub(crate) primary_key_id: Option, - // TODO: compare directly the embedders. pub(crate) embedding_config_updates: BTreeMap, pub(crate) settings_update_only: bool, /// The set of only the additional searchable fields. diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index edbed462c..9c7fb09b1 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -101,13 +101,6 @@ pub struct WriteBackToDocuments { } impl SettingsDiff { - pub fn should_reindex(&self) -> bool { - match self { - SettingsDiff::Remove { .. } | SettingsDiff::Reindex { .. } => true, - SettingsDiff::UpdateWithoutReindex { .. 
} => false, - } - } - pub fn from_settings(old: EmbeddingSettings, new: Setting) -> Self { match new { Setting::Set(new) => { From 6bf07d969e71b7661970bbcdcef4f2611c3a19dd Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 13 Jun 2024 15:49:42 +0200 Subject: [PATCH 089/110] add failing test --- meilisearch/tests/search/distinct.rs | 61 ++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/meilisearch/tests/search/distinct.rs b/meilisearch/tests/search/distinct.rs index aea98215d..68f7f18e8 100644 --- a/meilisearch/tests/search/distinct.rs +++ b/meilisearch/tests/search/distinct.rs @@ -107,6 +107,39 @@ static DOCUMENTS: Lazy = Lazy::new(|| { ]) }); +static NESTED_DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "id": 1, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": { "main": "Brown", "pattern": "stripped" }, + }, + { + "id": 2, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": { "main": "Black", "pattern": "stripped" }, + }, + { + "id": 3, + "description": "Leather Jacket", + "brand": "Lee Jeans", + "product_id": "123456", + "color": { "main": "Blue", "pattern": "used" }, + }, + { + "id": 4, + "description": "T-Shirt", + "brand": "Nike", + "product_id": "789012", + "color": { "main": "Blue", "pattern": "stripped" }, + } + ]) +}); + static DOCUMENT_PRIMARY_KEY: &str = "id"; static DOCUMENT_DISTINCT_KEY: &str = "product_id"; @@ -239,3 +272,31 @@ async fn distinct_search_with_pagination_no_ranking() { snapshot!(response["totalPages"], @"2"); snapshot!(response["totalHits"], @"6"); } + +#[actix_rt::test] +async fn distinct_at_search_time() { + let server = Server::new().await; + let index = server.index("tamo"); + + let documents = NESTED_DOCUMENTS.clone(); + index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; + index.update_settings_filterable_attributes(json!(["color"])).await; + index.wait_task(1).await; + + fn get_hits(response: &Value) -> 
Vec<&str> { + let hits_array = response["hits"] + .as_array() + .unwrap_or_else(|| panic!("{}", &serde_json::to_string_pretty(&response).unwrap())); + hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::>() + } + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "color.main"})).await; + let hits = get_hits(&response); + snapshot!(code, @"200 OK"); + snapshot!(hits.len(), @"0"); + snapshot!(format!("{:?}", hits), @r#"[]"#); + snapshot!(response["page"], @"0"); + snapshot!(response["totalPages"], @"3"); + snapshot!(response["totalHits"], @"6"); +} From b9b938c902b68c125786f56ddbc7b90087a332c3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 17:13:36 +0200 Subject: [PATCH 090/110] Change `retrieveVectors` behavior: - when the feature is disabled, documents are never modified - when the feature is enabled and `retrieveVectors` is disabled, `_vectors` is removed from documents - when the feature is enabled and `retrieveVectors` is enabled, vectors from the vectors DB are merged with `_vectors` in documents Additionally `_vectors` is never displayed when the `displayedAttributes` list does not contain either `*` or `_vectors` - fixed an issue where `_vectors` was not injected when all vectors in the dataset where always generated --- meilisearch/src/routes/indexes/documents.rs | 83 +++++++++---------- meilisearch/src/routes/indexes/search.rs | 24 +++--- meilisearch/src/routes/indexes/similar.rs | 12 ++- meilisearch/src/routes/multi_search.rs | 13 +-- meilisearch/src/search.rs | 92 ++++++++++++++++++--- 5 files changed, 150 insertions(+), 74 deletions(-) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index bfbe20207..1f413ec7d 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -40,7 +40,7 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{ 
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; -use crate::search::parse_filter; +use crate::search::{parse_filter, RetrieveVectors}; use crate::Opt; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { @@ -110,21 +110,20 @@ pub async fn get_document( debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - let GetDocument { fields, retrieve_vectors } = params.into_inner(); + let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); let features = index_scheduler.features(); - if retrieve_vectors.0 { - features.check_vector("Passing `retrieveVectors` as a parameter")?; - } + let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; + analytics.get_fetch_documents( - &DocumentFetchKind::PerDocumentId { retrieve_vectors: retrieve_vectors.0 }, + &DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 }, &req, ); let index = index_scheduler.index(&index_uid)?; let document = - retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors.0)?; + retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?; debug!(returns = ?document, "Get document"); Ok(HttpResponse::Ok().json(document)) } @@ -195,11 +194,6 @@ pub async fn documents_by_query_post( let body = body.into_inner(); debug!(parameters = ?body, "Get documents POST"); - let features = index_scheduler.features(); - if body.retrieve_vectors { - features.check_vector("Passing `retrieveVectors` as a parameter")?; - } - analytics.post_fetch_documents( &DocumentFetchKind::Normal { with_filter: body.filter.is_some(), @@ -224,11 +218,6 @@ pub async fn get_documents( let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); - let features = index_scheduler.features(); - if retrieve_vectors.0 { - features.check_vector("Passing 
`retrieveVectors` as a parameter")?; - } - let filter = match filter { Some(f) => match serde_json::from_str(&f) { Ok(v) => Some(v), @@ -266,6 +255,9 @@ fn documents_by_query( let index_uid = IndexUid::try_from(index_uid.into_inner())?; let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; + let features = index_scheduler.features(); + let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?; + let index = index_scheduler.index(&index_uid)?; let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?; @@ -608,7 +600,7 @@ fn some_documents<'a, 't: 'a>( index: &'a Index, rtxn: &'t RoTxn, doc_ids: impl IntoIterator + 'a, - retrieve_vectors: bool, + retrieve_vectors: RetrieveVectors, ) -> Result> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); @@ -617,24 +609,32 @@ fn some_documents<'a, 't: 'a>( Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; - - if retrieve_vectors { - let mut vectors = serde_json::Map::new(); - for (name, vector) in index.embeddings(rtxn, key)? 
{ - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { - embeddings: Some(vector.into()), - regenerate: !user_provided, - }; - vectors.insert( - name, - serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, - ); + match retrieve_vectors { + RetrieveVectors::Ignore => {} + RetrieveVectors::Hide => { + document.remove("_vectors"); + } + RetrieveVectors::Retrieve => { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; + for (name, vector) in index.embeddings(rtxn, key)? { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(key)); + let embeddings = ExplicitVectors { + embeddings: Some(vector.into()), + regenerate: !user_provided, + }; + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, + ); + } + document.insert("_vectors".into(), vectors.into()); } - document.insert("_vectors".into(), vectors.into()); } Ok(document) @@ -648,7 +648,7 @@ fn retrieve_documents>( limit: usize, filter: Option, attributes_to_retrieve: Option>, - retrieve_vectors: bool, + retrieve_vectors: RetrieveVectors, ) -> Result<(u64, Vec), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -688,10 +688,9 @@ fn retrieve_documents>( Ok(match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document?, - attributes_to_retrieve - .iter() - .map(|s| s.as_ref()) - .chain(retrieve_vectors.then_some("_vectors")), + attributes_to_retrieve.iter().map(|s| s.as_ref()).chain( + (retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"), + ), ), None => document?, }) @@ -705,7 +704,7 @@ fn retrieve_document>( index: &Index, doc_id: &str, attributes_to_retrieve: Option>, - retrieve_vectors: bool, + 
retrieve_vectors: RetrieveVectors, ) -> Result { let txn = index.read_txn()?; @@ -724,7 +723,7 @@ fn retrieve_document>( attributes_to_retrieve .iter() .map(|s| s.as_ref()) - .chain(retrieve_vectors.then_some("_vectors")), + .chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")), ), None => document, }; diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 6fdff4568..421cf2940 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -20,9 +20,9 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, - SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, - DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, + RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, + DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, + DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, }; use crate::search_queue::SearchQueue; @@ -225,10 +225,12 @@ pub async fn search_with_url_query( let features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; - + let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } @@ -265,10 +267,13 @@ pub async fn search_with_post( let 
features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vectors) + }) + .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); if search_result.degraded { @@ -295,9 +300,6 @@ pub fn search_kind( if query.hybrid.is_some() { features.check_vector("Passing `hybrid` as a parameter")?; } - if query.retrieve_vectors { - features.check_vector("Passing `retrieveVectors` as a parameter")?; - } // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing if query.vector.is_none() { diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 54ea912ec..1dd83b09b 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -17,8 +17,8 @@ use crate::analytics::{Analytics, SimilarAggregator}; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::search::{ - add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery, - SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, + SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -93,6 +93,8 @@ async fn similar( features.check_vector("Using the similar API")?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; + // Tenant 
token search_rules. if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { add_search_rules(&mut query.filter, search_rules); @@ -103,8 +105,10 @@ async fn similar( let (embedder_name, embedder) = SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?; - tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder)) - .await? + tokio::task::spawn_blocking(move || { + perform_similar(&index, query, embedder_name, embedder, retrieve_vectors) + }) + .await? } #[derive(Debug, deserr::Deserr)] diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index a83dc4bc0..1d697dac6 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex, + add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex, }; use crate::search_queue::SearchQueue; @@ -83,11 +83,14 @@ pub async fn multi_search_with_post( let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features) .with_index(query_index)?; + let retrieve_vector = + RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)) - .await - .with_index(query_index)?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await + .with_index(query_index)?; search_results.push(SearchResultWithIndex { index_uid: index_uid.into_inner(), diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 60f684ede..9632e3f5d 
100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -823,6 +823,7 @@ pub fn perform_search( index: &Index, query: SearchQuery, search_kind: SearchKind, + retrieve_vectors: RetrieveVectors, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -860,7 +861,8 @@ pub fn perform_search( page, hits_per_page, attributes_to_retrieve, - retrieve_vectors, + // use the enum passed as parameter + retrieve_vectors: _, attributes_to_crop, crop_length, attributes_to_highlight, @@ -968,7 +970,7 @@ pub fn perform_search( struct AttributesFormat { attributes_to_retrieve: Option>, - retrieve_vectors: bool, + retrieve_vectors: RetrieveVectors, attributes_to_highlight: Option>, attributes_to_crop: Option>, crop_length: usize, @@ -981,6 +983,36 @@ struct AttributesFormat { show_ranking_score_details: bool, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RetrieveVectors { + /// Do not touch the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is disabled + Ignore, + /// Remove the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false` + Hide, + /// Retrieve vectors from the DB and merge them into the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true` + Retrieve, +} + +impl RetrieveVectors { + pub fn new( + retrieve_vector: bool, + features: index_scheduler::RoFeatures, + ) -> Result { + match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) { + (true, Ok(())) => Ok(Self::Retrieve), + (true, Err(error)) => Err(error), + (false, Ok(())) => Ok(Self::Hide), + (false, Err(_)) => Ok(Self::Ignore), + } + } +} + fn make_hits( index: &Index, rtxn: &RoTxn<'_>, @@ -990,10 +1022,32 @@ fn make_hits( document_scores: Vec>, ) -> Result, MeilisearchHttpError> { let fields_ids_map = index.fields_ids_map(rtxn).unwrap(); - let displayed_ids = 
index - .displayed_fields_ids(rtxn)? - .map(|fields| fields.into_iter().collect::<BTreeSet<_>>()) - .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); + let displayed_ids = + index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>()); + + let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + + let vectors_is_hidden = match (&displayed_ids, vectors_fid) { + // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid + (None, _) => false, + // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field + (Some(_), None) => true, + // displayed_ids is a finite list, so hide if `_vectors` is not part of it + (Some(map), Some(vectors_fid)) => !map.contains(&vectors_fid), + }; + + let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors { + if vectors_is_hidden { + RetrieveVectors::Hide + } else { + RetrieveVectors::Retrieve + } + } else { + format.retrieve_vectors + }; + + let displayed_ids = + displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); let fids = |attrs: &BTreeSet<String>| { let mut ids = BTreeSet::new(); for attr in attrs { @@ -1016,9 +1070,7 @@ fn make_hits( .intersection(&displayed_ids) .cloned() .collect(); - let is_vectors_displayed = - fields_ids_map.id("_vectors").is_some_and(|fid| displayed_ids.contains(&fid)); - let retrieve_vectors = format.retrieve_vectors && is_vectors_displayed; + let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( @@ -1058,15 +1110,30 @@ fn make_hits( // First generate a document with all the displayed fields let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; + let add_vectors_fid = + vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve); + + // select the
attributes to retrieve let attributes_to_retrieve = to_retrieve_ids .iter() + // skip the vectors_fid if RetrieveVectors::Hide + .filter(|fid| match vectors_fid { + Some(vectors_fid) => { + !(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid) + } + None => true, + }) + // need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve` + .chain(add_vectors_fid.iter()) .map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); - if retrieve_vectors { - let mut vectors = serde_json::Map::new(); + if retrieve_vectors == RetrieveVectors::Retrieve { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; for (name, vector) in index.embeddings(rtxn, id)? { let user_provided = embedding_configs .iter() @@ -1148,6 +1215,7 @@ pub fn perform_similar( query: SimilarQuery, embedder_name: String, embedder: Arc, + retrieve_vectors: RetrieveVectors, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -1159,7 +1227,7 @@ pub fn perform_similar( filter: _, embedder: _, attributes_to_retrieve, - retrieve_vectors, + retrieve_vectors: _, show_ranking_score, show_ranking_score_details, ranking_score_threshold, From 09d9b63e1c0c1369e2c92b66e329d21e837f49d3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 17:16:41 +0200 Subject: [PATCH 091/110] - test case where all vectors were generated - update tests following changes in behavior from previous commit --- meilisearch/tests/search/hybrid.rs | 82 ++++++++++++++++++++++++++++ meilisearch/tests/vector/settings.rs | 82 ++++++++++++++-------------- 2 files changed, 124 insertions(+), 40 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index be6e0b1c8..31b2940d8 100644 --- a/meilisearch/tests/search/hybrid.rs +++ 
b/meilisearch/tests/search/hybrid.rs @@ -482,3 +482,85 @@ async fn query_combination() { snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + } + ] + "###); + + // remove `_vectors` from displayed attributes + let (response, code) = + index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel 
Cinematic Universe", + "id": "2" + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3" + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1" + } + ] + "###); +} diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs index 3fe161f9b..e53ceb383 100644 --- a/meilisearch/tests/vector/settings.rs +++ b/meilisearch/tests/vector/settings.rs @@ -73,7 +73,48 @@ async fn reset_embedder_documents() { server.wait_task(response.uid()).await; // Make sure the documents are still present - let (documents, _code) = index.get_all_documents(Default::default()).await; + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + limit: None, + offset: None, + retrieve_vectors: false, + fields: None, + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + }, + { + "id": 2, + "name": "billou" + }, + { + "id": 3, + "name": "intel" + }, + { + "id": 4, + "name": "max" + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure we are still able to retrieve their vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; snapshot!(json_string!(documents), @r###" { "results": [ @@ -174,45 +215,6 @@ async fn reset_embedder_documents() { } "###); - // Make sure we are still able to retrieve their vectors - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) - .await; - snapshot!(json_string!(documents), @r###" - { - "results": [ - { - "id": 0, - "name": "kefir", - "_vectors": {} - }, - { - "id": 1, - "name": "echo", - "_vectors": {} - }, - { - "id": 2, - "name": "billou", - "_vectors": {} - }, - { - "id": 3, - "name": "intel", - "_vectors": {} - }, - { - "id": 4, - "name": "max", - "_vectors": {} - } - ], - "offset": 0, 
- "limit": 20, - "total": 5 - } - "###); - // Make sure the arroy DB has been cleared let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; snapshot!(json_string!(documents), @r###" From 0a8f50695eac018f2664d996e024bf33d4e19d6f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 17:47:44 +0200 Subject: [PATCH 092/110] Fixes for Rust v1.79 --- dump/src/reader/v3/settings.rs | 1 + dump/src/reader/v4/settings.rs | 1 + dump/src/reader/v5/tasks.rs | 1 + milli/Cargo.toml | 4 ++-- milli/src/search/new/logger/visual.rs | 11 ++++------- milli/src/update/index_documents/transform.rs | 4 +--- xtask/Cargo.toml | 2 +- 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/dump/src/reader/v3/settings.rs b/dump/src/reader/v3/settings.rs index 0027bf4ff..3288bb1e7 100644 --- a/dump/src/reader/v3/settings.rs +++ b/dump/src/reader/v3/settings.rs @@ -152,6 +152,7 @@ impl Settings { } #[derive(Debug, Clone, Deserialize)] +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[cfg_attr(test, derive(serde::Serialize))] #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] diff --git a/dump/src/reader/v4/settings.rs b/dump/src/reader/v4/settings.rs index 964cd1152..78d9118ff 100644 --- a/dump/src/reader/v4/settings.rs +++ b/dump/src/reader/v4/settings.rs @@ -182,6 +182,7 @@ impl Settings { } } +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[derive(Debug, Clone, Deserialize)] #[cfg_attr(test, derive(serde::Serialize))] #[serde(deny_unknown_fields)] diff --git a/dump/src/reader/v5/tasks.rs b/dump/src/reader/v5/tasks.rs index 528a870fc..8dfb2d0b0 100644 --- a/dump/src/reader/v5/tasks.rs +++ b/dump/src/reader/v5/tasks.rs @@ -200,6 +200,7 @@ impl std::ops::Deref for IndexUid { } } +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[derive(Debug)] #[cfg_attr(test, derive(serde::Serialize))] #[cfg_attr(test, serde(rename_all = "camelCase"))] diff --git 
a/milli/Cargo.toml b/milli/Cargo.toml index 7fba2af1e..a4aa4ef95 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -71,10 +71,10 @@ csv = "1.3.0" candle-core = { version = "0.4.1" } candle-transformers = { version = "0.4.1" } candle-nn = { version = "0.4.1" } -tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [ +tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [ "onig", ] } -hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ +hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [ "online", ] } tiktoken-rs = "0.5.8" diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 8df56da89..2bffdd8d9 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -22,7 +22,7 @@ pub enum SearchEvents { RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 }, RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 }, RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, - RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, + RankingRuleEndIteration { ranking_rule_idx: usize }, ExtendResults { new: Vec }, ProximityGraph { graph: RankingRuleGraph }, ProximityPaths { paths: Vec>> }, @@ -123,12 +123,9 @@ impl SearchLogger for VisualSearchLogger { &mut self, ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule, - universe: &RoaringBitmap, + _universe: &RoaringBitmap, ) { - self.events.push(SearchEvents::RankingRuleEndIteration { - ranking_rule_idx, - universe_len: universe.len(), - }); + self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx }); self.location.pop(); } fn 
add_to_results(&mut self, docids: &[u32]) { @@ -326,7 +323,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); self.write_skip_bucket(bucket_len)?; } - SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => { + SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => { assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); self.write_end_iteration()?; } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 997ab64ff..1dff29a90 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -53,7 +53,6 @@ pub struct Transform<'a, 'i> { fields_ids_map: FieldsIdsMap, indexer_settings: &'a IndexerConfig, - pub autogenerate_docids: bool, pub index_documents_method: IndexDocumentsMethod, available_documents_ids: AvailableDocumentsIds, @@ -107,7 +106,7 @@ impl<'a, 'i> Transform<'a, 'i> { index: &'i Index, indexer_settings: &'a IndexerConfig, index_documents_method: IndexDocumentsMethod, - autogenerate_docids: bool, + _autogenerate_docids: bool, ) -> Result { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. 
@@ -141,7 +140,6 @@ impl<'a, 'i> Transform<'a, 'i> { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, - autogenerate_docids, available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), original_sorter, flattened_sorter, diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 562dfddb3..a618b06a5 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [ "stream", "json", "rustls-tls", -], default_features = false } +], default-features = false } serde = { version = "1.0.195", features = ["derive"] } serde_json = "1.0.111" sha2 = "0.10.8" From a8a085442130bb7ff755de6b5d42fdbf8f2241e9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 17 Jun 2024 14:30:50 +0200 Subject: [PATCH 093/110] Update meilisearch/src/analytics/segment_analytics.rs --- meilisearch/src/analytics/segment_analytics.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index ebd808b42..aa87a234c 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -988,7 +988,6 @@ impl SearchAggregator { "with_geoPoint": sort_with_geo_point, "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), }, - // TODO ask help from María "distinct": distinct, "filter": { "with_geoRadius": filter_with_geo_radius, From d7844a6e4542c7b02c221661ce49d2a2be70a8ed Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 17 Jun 2024 15:37:32 +0200 Subject: [PATCH 094/110] add a bunch of tests on the errors of the distinct at search time --- meilisearch/tests/search/errors.rs | 63 ++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 53d516c44..3f631773e 100644 --- a/meilisearch/tests/search/errors.rs +++ 
b/meilisearch/tests/search/errors.rs @@ -1072,3 +1072,66 @@ async fn search_on_unknown_field_plus_joker() { ) .await; } + +#[actix_rt::test] +async fn distinct_at_search_time() { + let server = Server::new().await; + let index = server.index("tamo"); + let (task, _) = index.create(None).await; + let task = index.wait_task(task.uid()).await; + snapshot!(task, name: "task-succeed"); + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); + + let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await; + index.wait_task(task.uid()).await; + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); + + let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await; + index.wait_task(task.uid()).await; + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. 
Available filterable attributes are: `color, <..hidden-attributes>`.", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); + + let (response, code) = + index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Invalid value type at `.distinct`: expected a string, but found a boolean: `true`", + "code": "invalid_search_distinct", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" + } + "###); +} From 43875e6758ab434c044cf9564852b7789eb88159 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 17 Jun 2024 15:59:30 +0200 Subject: [PATCH 095/110] fix bug around nested fields --- meilisearch/tests/search/distinct.rs | 24 ++++++++++++++---------- milli/src/search/mod.rs | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/meilisearch/tests/search/distinct.rs b/meilisearch/tests/search/distinct.rs index 68f7f18e8..2023c01a8 100644 --- a/meilisearch/tests/search/distinct.rs +++ b/meilisearch/tests/search/distinct.rs @@ -280,23 +280,27 @@ async fn distinct_at_search_time() { let documents = NESTED_DOCUMENTS.clone(); index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; - index.update_settings_filterable_attributes(json!(["color"])).await; - index.wait_task(1).await; + let (task, _) = index.update_settings_filterable_attributes(json!(["color.main"])).await; + let task = index.wait_task(task.uid()).await; + snapshot!(task, name: "succeed"); - fn get_hits(response: &Value) -> Vec<&str> { + fn get_hits(response: &Value) -> Vec { let hits_array = response["hits"] .as_array() .unwrap_or_else(|| panic!("{}", &serde_json::to_string_pretty(&response).unwrap())); - hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::>() + hits_array + .iter() + .map(|h| 
h[DOCUMENT_PRIMARY_KEY].as_number().unwrap().to_string()) + .collect::>() } let (response, code) = - index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "color.main"})).await; + index.search_post(json!({"page": 1, "hitsPerPage": 3, "distinct": "color.main"})).await; let hits = get_hits(&response); snapshot!(code, @"200 OK"); - snapshot!(hits.len(), @"0"); - snapshot!(format!("{:?}", hits), @r#"[]"#); - snapshot!(response["page"], @"0"); - snapshot!(response["totalPages"], @"3"); - snapshot!(response["totalHits"], @"6"); + snapshot!(hits.len(), @"3"); + snapshot!(format!("{:?}", hits), @r###"["1", "2", "3"]"###); + snapshot!(response["page"], @"1"); + snapshot!(response["totalPages"], @"1"); + snapshot!(response["totalHits"], @"3"); } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 922b72d04..bf488f9f0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -178,7 +178,7 @@ impl<'a> Search<'a> { if let Some(distinct) = &self.distinct { let filterable_fields = ctx.index.filterable_fields(ctx.txn)?; - if !filterable_fields.contains(distinct) { + if !crate::is_faceted(distinct, &filterable_fields) { let (valid_fields, hidden_fields) = ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?; return Err(Error::UserError(UserError::InvalidDistinctAttribute { From 8ba65e333bd40f8e6e35fd41f8dc90732e6de631 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 17 Jun 2024 16:50:26 +0200 Subject: [PATCH 096/110] add snapshot files --- .../distinct_at_search_time/succeed.snap | 20 +++++++++++++++++++ .../distinct_at_search_time/task-succeed.snap | 18 +++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 meilisearch/tests/search/snapshots/distinct.rs/distinct_at_search_time/succeed.snap create mode 100644 meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap diff --git a/meilisearch/tests/search/snapshots/distinct.rs/distinct_at_search_time/succeed.snap 
b/meilisearch/tests/search/snapshots/distinct.rs/distinct_at_search_time/succeed.snap new file mode 100644 index 000000000..1b8190c42 --- /dev/null +++ b/meilisearch/tests/search/snapshots/distinct.rs/distinct_at_search_time/succeed.snap @@ -0,0 +1,20 @@ +--- +source: meilisearch/tests/search/distinct.rs +--- +{ + "uid": 1, + "indexUid": "tamo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "filterableAttributes": [ + "color.main" + ] + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap b/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap new file mode 100644 index 000000000..903e96ffb --- /dev/null +++ b/meilisearch/tests/search/snapshots/errors.rs/distinct_at_search_time/task-succeed.snap @@ -0,0 +1,18 @@ +--- +source: meilisearch/tests/search/errors.rs +--- +{ + "uid": 0, + "indexUid": "tamo", + "status": "succeeded", + "type": "indexCreation", + "canceledBy": null, + "details": { + "primaryKey": null + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} From a04041c8f22494886bbad2423291745aa48e48e1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 19 Jun 2024 16:25:33 +0200 Subject: [PATCH 097/110] Only spawn the pool once --- .../src/update/index_documents/extract/mod.rs | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 9da3983fc..2feb85414 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -11,7 +11,7 @@ mod extract_word_position_docids; use std::fs::File; use std::io::BufReader; -use std::sync::Arc; +use std::sync::{Arc, 
OnceLock}; use crossbeam_channel::Sender; use rayon::prelude::*; @@ -32,7 +32,7 @@ use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; use crate::index::IndexEmbeddingConfig; use crate::update::settings::InnerIndexSettingsDiff; -use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; +use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; /// Extract data for each databases from obkv documents in parallel. /// Send data in grenad file over provided Sender. @@ -207,6 +207,18 @@ fn run_extraction_task( }) } +fn request_threads() -> &'static ThreadPoolNoAbort { + static REQUEST_THREADS: OnceLock = OnceLock::new(); + + REQUEST_THREADS.get_or_init(|| { + ThreadPoolNoAbortBuilder::new() + .num_threads(crate::vector::REQUEST_PARALLELISM) + .thread_name(|index| format!("embedding-request-{index}")) + .build() + .unwrap() + }) +} + /// Extract chunked data and send it into lmdb_writer_sx sender: /// - documents fn send_original_documents_data( @@ -219,11 +231,6 @@ fn send_original_documents_data( let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; - let request_threads = ThreadPoolNoAbortBuilder::new() - .num_threads(crate::vector::REQUEST_PARALLELISM) - .thread_name(|index| format!("embedding-request-{index}")) - .build()?; - let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) // no point in indexing vectors without embedders && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); @@ -256,7 +263,7 @@ fn send_original_documents_data( prompts, indexer, embedder.clone(), - &request_threads, + request_threads(), ) { Ok(results) => Some(results), Err(error) => { From bad28cc9e2f4bdd5dcee61824c43afbd927c71e5 Mon Sep 17 00:00:00 2001 From: curquiza Date: Thu, 20 Jun 2024 10:01:36 +0200 Subject: [PATCH 098/110] Update mini-dashboard 2.14 --- meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index ebcbbd266..75571b535 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -158,5 +158,5 @@ vietnamese = ["meilisearch-types/vietnamese"] swedish-recomposition = ["meilisearch-types/swedish-recomposition"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" -sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" +sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe" From 19d7cdc20d144fdd62284c975e9be5ae370aae9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 20 Jun 2024 12:57:08 +0200 Subject: [PATCH 099/110] Improve facet distribution speed in lexico mode --- .../search/facet/facet_distribution_iter.rs | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index a8aa1a006..26b4ae80e 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,14 +1,17 @@ use std::cmp::Reverse; use std::collections::BinaryHeap; +use std::io::Cursor; use std::ops::ControlFlow; use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec, +}; use crate::heed_codec::BytesRefCodec; -use crate::DocumentId; +use crate::{CboRoaringBitmapCodec, DocumentId}; /// Call the given closure on the facet distribution of the candidate documents. 
/// @@ -31,12 +34,9 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>( where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { + let db = db.remap_data_type::(); let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback }; - let highest_level = get_highest_level( - rtxn, - db.remap_key_type::>(), - field_id, - )?; + let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; @@ -146,7 +146,7 @@ where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupLazyValueCodec>, field_id: u16, callback: CB, } @@ -171,7 +171,10 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } - let docids_in_common = value.bitmap & candidates; + let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; if !docids_in_common.is_empty() { let any_docid_in_common = docids_in_common.min().unwrap(); match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)? 
@@ -205,7 +208,10 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } - let docids_in_common = value.bitmap & candidates; + let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; if !docids_in_common.is_empty() { let cf = self.iterate( &docids_in_common, From 6fa4da8ae7cf7df6f78d9f11411a6a81b6625bc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 20 Jun 2024 12:58:51 +0200 Subject: [PATCH 100/110] Improve facet distribution speed in count mode --- .../search/facet/facet_distribution_iter.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 26b4ae80e..1e6ea8d88 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,6 +1,5 @@ use std::cmp::Reverse; use std::collections::BinaryHeap; -use std::io::Cursor; use std::ops::ControlFlow; use heed::Result; @@ -75,11 +74,8 @@ where // Represents the list of keys that we must explore. let mut heap = BinaryHeap::new(); - let highest_level = get_highest_level( - rtxn, - db.remap_key_type::>(), - field_id, - )?; + let db = db.remap_data_type::(); + let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ // We first fill the heap with values from the highest level @@ -92,7 +88,10 @@ where if key.field_id != field_id { break; } - let intersection = value.bitmap & candidates; + let intersection = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; let count = intersection.len(); if count != 0 { heap.push(LevelEntry { @@ -121,7 +120,10 @@ where if key.field_id != field_id { break; } - let intersection = value.bitmap & candidates; + let intersection = CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + candidates, + )?; let count = intersection.len(); if count != 0 { heap.push(LevelEntry { From 9736e16a88868f352eda3605a723f04ba60d1b7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 20 Jun 2024 13:02:44 +0200 Subject: [PATCH 101/110] Make clippy happy --- .../update/index_documents/extract/extract_vector_points.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 736c21c9f..36fa346a5 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -290,7 +290,7 @@ pub fn extract_vector_points( regenerate_if_prompt_changed( obkv, (old_prompt, prompt), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), )? 
} else { // we can simply ignore user provided vectors as they are not regenerated and are @@ -306,7 +306,7 @@ pub fn extract_vector_points( prompt, (add_to_user_provided, remove_from_user_provided), (old, new), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), document_id, )?, }; From 1693332cab0c209d5729233ddceae4e9bc483ea5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 20 Jun 2024 15:59:32 +0200 Subject: [PATCH 102/110] Update arroy and always build the tree that need to be built --- Cargo.lock | 24 ++++---- index-scheduler/Cargo.toml | 2 +- index-scheduler/src/lib.rs | 2 +- meilisearch/tests/common/mod.rs | 2 +- meilisearch/tests/vector/mod.rs | 82 +++++++++++++++++++++++++ milli/Cargo.toml | 2 +- milli/src/error.rs | 5 +- milli/src/index.rs | 4 +- milli/src/update/index_documents/mod.rs | 5 +- 9 files changed, 106 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2a5960502..3c728f348 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -381,9 +381,9 @@ dependencies = [ [[package]] name = "arroy" -version = "0.3.1" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9" +checksum = "2ece9e5347e7fdaaea3181dec7f916677ad5f3fcbac183648ce1924eb4aeef9a" dependencies = [ "bytemuck", "byteorder", @@ -679,9 +679,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" [[package]] name = "bytemuck" -version = "1.15.0" +version = "1.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15" +checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" dependencies = [ "bytemuck_derive", ] @@ -2273,9 +2273,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "heed" -version = "0.20.1" +version = "0.20.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd" +checksum = "f60d7cff16094be9627830b399c087a25017e93fb3768b87cd656a68ccb1ebe8" dependencies = [ "bitflags 2.5.0", "byteorder", @@ -3172,9 +3172,9 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da" [[package]] name = "lmdb-master-sys" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a" +checksum = "a5142795c220effa4c8f4813537bd4c88113a07e45e93100ccb2adc5cec6c7f3" dependencies = [ "cc", "doxygen-rs", @@ -5053,18 +5053,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.58" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 8959bb070..aff3b379f 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,7 @@ ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] -arroy = "0.3.1" +arroy = "0.4.0" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.34.0", features = ["json", "redactions"] } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 88997b715..213ec3230 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ 
-5396,7 +5396,7 @@ mod tests { let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy) .map(Some) .or_else(|e| match e { - arroy::Error::MissingMetadata => Ok(None), + arroy::Error::MissingMetadata(_) => Ok(None), e => Err(e), }) .transpose(); diff --git a/meilisearch/tests/common/mod.rs b/meilisearch/tests/common/mod.rs index 317e5e171..4476e0d1f 100644 --- a/meilisearch/tests/common/mod.rs +++ b/meilisearch/tests/common/mod.rs @@ -65,7 +65,7 @@ impl Display for Value { write!( f, "{}", - json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }) + json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]", ".processingTimeMs" => "[duration]" }) ) } } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 8d619a15a..53b2cca76 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -225,3 +225,85 @@ async fn clear_documents() { } "###); } + +#[actix_rt::test] +async fn add_remove_one_vector_4588() { + // https://github.com/meilisearch/meilisearch/issues/4588 + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, name: "settings-processed"); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = 
index.wait_task(value.uid()).await; + snapshot!(task, name: "document-added"); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, name: "document-deleted"); + + let (documents, _code) = index.search_post(json!({"vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "hits": [ + { + "id": 0, + "name": "kefir" + } + ], + "query": "", + "processingTimeMs": 1, + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1, + "semanticHitCount": 1 + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index a4aa4ef95..fd7bde99b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -79,7 +79,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.8" liquid = "0.26.4" -arroy = "0.3.1" +arroy = "0.4.0" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.9.7", features = ["json"] } diff --git a/milli/src/error.rs b/milli/src/error.rs index 7420ce667..8210d92e0 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -281,8 +281,9 @@ impl From for Error { arroy::Error::DatabaseFull | arroy::Error::InvalidItemAppend | arroy::Error::UnmatchingDistance { .. } - | arroy::Error::MissingNode - | arroy::Error::MissingMetadata => { + | arroy::Error::NeedBuild(_) + | arroy::Error::MissingKey { .. 
} + | arroy::Error::MissingMetadata(_) => { Error::InternalError(InternalError::ArroyError(value)) } } diff --git a/milli/src/index.rs b/milli/src/index.rs index d325d6fa4..0a7a20ce0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1610,7 +1610,7 @@ impl Index { arroy::Reader::open(rtxn, k, self.vector_arroy) .map(Some) .or_else(|e| match e { - arroy::Error::MissingMetadata => Ok(None), + arroy::Error::MissingMetadata(_) => Ok(None), e => Err(e.into()), }) .transpose() @@ -1643,7 +1643,7 @@ impl Index { let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy) .map(Some) .or_else(|e| match e { - arroy::Error::MissingMetadata => Ok(None), + arroy::Error::MissingMetadata(_) => Ok(None), e => Err(e), }) .transpose(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3586c9c6d..089b56025 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -547,10 +547,11 @@ where pool.install(|| { for k in crate::vector::arroy_db_range_for_embedder(embedder_index) { let writer = arroy::Writer::new(vector_arroy, k, dimension); - if writer.is_empty(wtxn)? { + if writer.need_build(wtxn)? { + writer.build(wtxn, &mut rng, None)?; + } else if writer.is_empty(wtxn)? 
{ break; } - writer.build(wtxn, &mut rng, None)?; } Result::Ok(()) }) From 7be17b7e4c37b6fee5062062d3fab815fa21aa01 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 24 Jun 2024 10:52:57 +0200 Subject: [PATCH 103/110] add the missing snapshots --- .../document-added.snap | 19 +++++++++++++++ .../document-deleted.snap | 19 +++++++++++++++ .../settings-processed.snap | 23 +++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-added.snap create mode 100644 meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap create mode 100644 meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/settings-processed.snap diff --git a/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-added.snap b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-added.snap new file mode 100644 index 000000000..52d9ad38d --- /dev/null +++ b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-added.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/vector/mod.rs +--- +{ + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap new file mode 100644 index 000000000..de02d0b1d --- /dev/null +++ b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/vector/mod.rs +--- +{ + "uid": 2, + "indexUid": "doggo", + "status": "succeeded", + "type": 
"documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/settings-processed.snap b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/settings-processed.snap new file mode 100644 index 000000000..316305fa8 --- /dev/null +++ b/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/settings-processed.snap @@ -0,0 +1,23 @@ +--- +source: meilisearch/tests/vector/mod.rs +--- +{ + "uid": 0, + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} From 606e108420d9474fe7295d662c4bb4e2c7882f81 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 24 Jun 2024 11:13:45 +0200 Subject: [PATCH 104/110] fix all the flaky snapshots --- meilisearch/tests/vector/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 53b2cca76..4172ef444 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -213,11 +213,11 @@ async fn clear_documents() { // Make sure the arroy DB has been cleared let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; - snapshot!(json_string!(documents), @r###" + snapshot!(documents, @r###" { "hits": [], "query": "", - "processingTimeMs": 0, + "processingTimeMs": "[duration]", "limit": 20, "offset": 0, "estimatedTotalHits": 0, @@ -272,7 +272,7 @@ async fn add_remove_one_vector_4588() { snapshot!(task, name: "document-deleted"); let (documents, _code) = 
index.search_post(json!({"vector": [1, 1, 1] })).await; - snapshot!(json_string!(documents), @r###" + snapshot!(documents, @r###" { "hits": [ { @@ -281,7 +281,7 @@ async fn add_remove_one_vector_4588() { } ], "query": "", - "processingTimeMs": 1, + "processingTimeMs": "[duration]", "limit": 20, "offset": 0, "estimatedTotalHits": 1, From 1daaed163a90b4550f6e6f4abc1d4e544a9f0630 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 27 Jun 2024 11:01:52 +0200 Subject: [PATCH 105/110] Make _vectors.:embedding.regenerate mandatory + tests + error messages --- meilisearch-types/src/error.rs | 3 +- meilisearch/tests/vector/mod.rs | 205 +++++++++++++++++++++++++++++ milli/src/error.rs | 2 + milli/src/vector/parsed_vectors.rs | 142 +++++++++++++++++--- 4 files changed, 336 insertions(+), 16 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 086396d7d..f529238e4 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -398,7 +398,8 @@ impl ErrorCode for milli::Error { UserError::CriterionError(_) => Code::InvalidSettingsRankingRules, UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, - UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType, + UserError::InvalidVectorsMapType { .. } + | UserError::InvalidVectorsEmbedderConf { .. 
} => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, UserError::SortError(_) => Code::InvalidSearchSort, UserError::InvalidMinTypoWordLenSetting(_, _) => { diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 4172ef444..dcefe2460 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -190,6 +190,211 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index { index } +#[actix_rt::test] +async fn user_provided_embeddings_error() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + // First case, we forget to specify the `regenerate` + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 2, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Missing field `regenerate` inside `.manual`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Second case, we don't specify anything + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 3, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Third case, we specify something wrong in place of regenerate + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 4, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 5, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 6, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 7, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 8, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + #[actix_rt::test] async fn clear_documents() { let server = 
Server::new().await; diff --git a/milli/src/error.rs b/milli/src/error.rs index 8210d92e0..8e03fde4e 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -119,6 +119,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco InvalidVectorDimensions { expected: usize, found: usize }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, + #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")] + InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError }, #[error("{0}")] InvalidFilter(String), #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 92d6cb382..f934953fd 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -1,5 +1,6 @@ use std::collections::{BTreeMap, BTreeSet}; +use deserr::{take_cf_content, DeserializeError, Deserr, Sequence}; use obkv::KvReader; use serde_json::{from_slice, Value}; @@ -10,13 +11,44 @@ use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Debug)] #[serde(untagged)] pub enum Vectors { ImplicitlyUserProvided(VectorOrArrayOfVectors), Explicit(ExplicitVectors), } +impl Deserr for Vectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + match value { + deserr::Value::Sequence(_) | deserr::Value::Null => { + Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value( + value, location, + )?)) + } + deserr::Value::Map(_) => { + 
Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?)) + } + + value => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[ + deserr::ValueKind::Sequence, + deserr::ValueKind::Map, + deserr::ValueKind::Null, + ], + }, + location, + ))), + } + } +} + impl Vectors { pub fn must_regenerate(&self) -> bool { match self { @@ -37,9 +69,11 @@ impl Vectors { } } -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Deserr, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { + #[serde(default)] + #[deserr(default)] pub embeddings: Option, pub regenerate: bool, } @@ -149,13 +183,20 @@ impl ParsedVectorsDiff { pub struct ParsedVectors(pub BTreeMap); +impl Deserr for ParsedVectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + let value = >::deserialize_from_value(value, location)?; + Ok(ParsedVectors(value)) + } +} + impl ParsedVectors { pub fn from_bytes(value: &[u8]) -> Result { - let Ok(value) = from_slice(value) else { - let value = from_slice(value).map_err(Error::InternalSerdeJson)?; - return Err(Error::InvalidMap(value)); - }; - Ok(ParsedVectors(value)) + let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?; + deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error }) } pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet) { @@ -165,6 +206,7 @@ impl ParsedVectors { pub enum Error { InvalidMap(Value), + InvalidEmbedderConf { error: deserr::errors::JsonError }, InternalSerdeJson(serde_json::Error), } @@ -174,6 +216,12 @@ impl Error { Error::InvalidMap(value) => { crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value }) } + Error::InvalidEmbedderConf { error } => { + crate::Error::UserError(UserError::InvalidVectorsEmbedderConf { + document_id, + error, + }) + } 
Error::InternalSerdeJson(error) => { crate::Error::InternalError(InternalError::SerdeJson(error)) } @@ -194,13 +242,73 @@ fn to_vector_map( } /// Represents either a vector or an array of multiple vectors. -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Debug)] #[serde(transparent)] pub struct VectorOrArrayOfVectors { #[serde(with = "either::serde_untagged_optional")] inner: Option, Embedding>>, } +impl Deserr for VectorOrArrayOfVectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + match value { + deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }), + deserr::Value::Sequence(seq) => { + let mut iter = seq.into_iter(); + let location = location.push_index(0); + match iter.next().map(|v| v.into_value()) { + None => { + // With the strange way serde serialize the `Either`, we must send the left part + // otherwise it'll consider we returned [[]] + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) }) + } + Some(val @ deserr::Value::Sequence(_)) => { + let first = Embedding::deserialize_from_value(val, location)?; + let mut collect = vec![first]; + let mut tail = iter + .map(|v| Embedding::deserialize_from_value(v.into_value(), location)) + .collect::, _>>()?; + collect.append(&mut tail); + + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) }) + } + Some( + val @ deserr::Value::Integer(_) + | val @ deserr::Value::NegativeInteger(_) + | val @ deserr::Value::Float(_), + ) => { + let first = ::deserialize_from_value(val, location)?; + let mut embedding = iter + .map(|v| ::deserialize_from_value(v.into_value(), location)) + .collect::, _>>()?; + embedding.insert(0, first); + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) }) + } + Some(value) => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[deserr::ValueKind::Sequence, 
deserr::ValueKind::Float], + }, + location, + ))), + } + } + value => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null], + }, + location, + ))), + } + } +} + impl VectorOrArrayOfVectors { pub fn into_array_of_vectors(self) -> Option> { match self.inner? { @@ -234,15 +342,19 @@ impl From> for VectorOrArrayOfVectors { mod test { use super::VectorOrArrayOfVectors; + fn embedding_from_str(s: &str) -> Result { + let value: serde_json::Value = serde_json::from_str(s).unwrap(); + deserr::deserialize(value) + } + #[test] fn array_of_vectors() { - let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap(); - let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap(); - let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap(); - let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap(); - let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap(); - let two_vecs: VectorOrArrayOfVectors = - serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); + let null = embedding_from_str("null").unwrap(); + let empty = embedding_from_str("[]").unwrap(); + let one = embedding_from_str("[0.1]").unwrap(); + let two = embedding_from_str("[0.1, 0.2]").unwrap(); + let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap(); + let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null"); insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]"); From ce08dc509bc02280bdbb3143a1f90b83a8343542 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 27 Jun 2024 11:51:45 +0200 Subject: [PATCH 106/110] add more tests and improve the location of the error --- meilisearch/tests/vector/mod.rs | 82 ++++++++++++++++++++++++++++-- milli/src/vector/parsed_vectors.rs | 23 ++++++--- 2 files changed, 95 insertions(+), 10 
deletions(-) diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index dcefe2460..0343ab785 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -375,18 +375,92 @@ async fn user_provided_embeddings_error() { let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); let task = index.wait_task(value.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; snapshot!(task, @r###" { - "uid": 8, + "uid": 10, "indexUid": "doggo", - "status": "succeeded", + "status": "failed", "type": "documentAdditionOrUpdate", "canceledBy": null, "details": { "receivedDocuments": 1, - "indexedDocuments": 1 + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, 0.2], 0.3] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 11, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, true], 0.3] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 12, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. 
Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" }, - "error": null, "duration": "[duration]", "enqueuedAt": "[date]", "startedAt": "[date]", diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index f934953fd..f555b39ae 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -258,7 +258,6 @@ impl Deserr for VectorOrArrayOfVectors { deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }), deserr::Value::Sequence(seq) => { let mut iter = seq.into_iter(); - let location = location.push_index(0); match iter.next().map(|v| v.into_value()) { None => { // With the strange way serde serialize the `Either`, we must send the left part @@ -266,10 +265,16 @@ impl Deserr for VectorOrArrayOfVectors { Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) }) } Some(val @ deserr::Value::Sequence(_)) => { - let first = Embedding::deserialize_from_value(val, location)?; + let first = Embedding::deserialize_from_value(val, location.push_index(0))?; let mut collect = vec![first]; let mut tail = iter - .map(|v| Embedding::deserialize_from_value(v.into_value(), location)) + .enumerate() + .map(|(i, v)| { + Embedding::deserialize_from_value( + v.into_value(), + location.push_index(i + 1), + ) + }) .collect::, _>>()?; collect.append(&mut tail); @@ -280,9 +285,15 @@ impl Deserr for VectorOrArrayOfVectors { | val @ deserr::Value::NegativeInteger(_) | val @ deserr::Value::Float(_), ) => { - let first = ::deserialize_from_value(val, location)?; + let first = ::deserialize_from_value(val, location.push_index(0))?; let mut embedding = iter - .map(|v| ::deserialize_from_value(v.into_value(), location)) + .enumerate() + .map(|(i, v)| { + ::deserialize_from_value( + v.into_value(), + location.push_index(i + 1), + ) + }) 
.collect::, _>>()?; embedding.insert(0, first); Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) }) @@ -293,7 +304,7 @@ impl Deserr for VectorOrArrayOfVectors { actual: value, accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float], }, - location, + location.push_index(0), ))), } } From 8c4921b9ddd5fe3e2a9547633357b3c34f8a9761 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 27 Jun 2024 14:17:33 +0200 Subject: [PATCH 107/110] Add failing test on limit+offset for hybrid search --- meilisearch/tests/search/hybrid.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 31b2940d8..d8069ea5c 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -150,6 +150,35 @@ async fn simple_search() { snapshot!(response["semanticHitCount"], @"3"); } +#[actix_rt::test] +async fn limit_offset() { + let server = Server::new().await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true, "offset": 1, "limit": 1}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###); + snapshot!(response["semanticHitCount"], @"0"); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + + let server = Server::new().await; + let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "vector": [1.0, 
1.0], "hybrid": {"semanticRatio": 0.9}, "retrieveVectors": true, "offset": 1, "limit": 1}), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}}]"###); + snapshot!(response["semanticHitCount"], @"1"); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); +} + #[actix_rt::test] async fn simple_search_hf() { let server = Server::new().await; From e53de15b8e7014232924dc107697ebe8e36adc2b Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 27 Jun 2024 14:21:48 +0200 Subject: [PATCH 108/110] Fix behavior of limit and offset for hybrid search when keyword results are returned early The test is fixed --- meilisearch/tests/search/hybrid.rs | 2 +- milli/src/search/hybrid.rs | 47 ++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index d8069ea5c..02768bf60 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -161,7 +161,7 @@ async fn limit_offset() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###); snapshot!(response["semanticHitCount"], @"0"); assert_eq!(response["hits"].as_array().unwrap().len(), 1); diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index 1c784097d..f7e1aa492 100644 --- a/milli/src/search/hybrid.rs +++ 
b/milli/src/search/hybrid.rs @@ -178,16 +178,16 @@ impl<'a> Search<'a> { // completely skip semantic search if the results of the keyword search are good enough if self.results_good_enough(&keyword_results, semantic_ratio) { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); } // no vector search against placeholder search let Some(query) = search.query.take() else { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; // no embedder, no semantic search let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else { - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; let vector_query = match vector { @@ -239,3 +239,44 @@ impl<'a> Search<'a> { true } } + +fn return_keyword_results( + limit: usize, + offset: usize, + SearchResult { + matching_words, + candidates, + mut documents_ids, + mut document_scores, + degraded, + used_negative_operator, + }: SearchResult, +) -> (SearchResult, Option) { + let (documents_ids, document_scores) = if offset >= documents_ids.len() || + // technically redundant because documents_ids.len() == document_scores.len(), + // defensive programming + offset >= document_scores.len() + { + (vec![], vec![]) + } else { + // PANICS: offset < len + documents_ids.rotate_left(offset); + documents_ids.truncate(limit); + + // PANICS: offset < len + document_scores.rotate_left(offset); + document_scores.truncate(limit); + (documents_ids, document_scores) + }; + ( + SearchResult { + matching_words, + candidates, + documents_ids, + document_scores, + degraded, + used_negative_operator, + }, + Some(0), + ) +} From ee14d5196c72d3f1122e1edff977b676f592655e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 15:18:30 +0200 Subject: [PATCH 109/110] fix the tests --- meilisearch/tests/common/index.rs | 9 ++++--- 
meilisearch/tests/documents/add_documents.rs | 2 +- meilisearch/tests/documents/errors.rs | 26 ++++++++++---------- meilisearch/tests/search/errors.rs | 4 +-- meilisearch/tests/similar/errors.rs | 4 +-- 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index c8afa5e3e..045f8673c 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -185,7 +185,7 @@ impl Index<'_> { pub async fn get_document(&self, id: u64, options: Option) -> (Value, StatusCode) { let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id); if let Some(options) = options { - write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap(); + write!(url, "{}", yaup::to_string(&options).unwrap()).unwrap(); } self.service.get(url).await } @@ -202,7 +202,7 @@ impl Index<'_> { pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) { let url = format!( - "/indexes/{}/documents?{}", + "/indexes/{}/documents{}", urlencode(self.uid.as_ref()), yaup::to_string(&options).unwrap() ); @@ -427,8 +427,11 @@ impl Index<'_> { #[derive(Debug, Default, serde::Serialize)] #[serde(rename_all = "camelCase")] pub struct GetAllDocumentsOptions { + #[serde(skip_serializing_if = "Option::is_none")] pub limit: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub offset: Option, - pub retrieve_vectors: bool, + #[serde(skip_serializing_if = "Option::is_none")] pub fields: Option>, + pub retrieve_vectors: bool, } diff --git a/meilisearch/tests/documents/add_documents.rs b/meilisearch/tests/documents/add_documents.rs index 5e32564c7..289873f63 100644 --- a/meilisearch/tests/documents/add_documents.rs +++ b/meilisearch/tests/documents/add_documents.rs @@ -374,7 +374,7 @@ async fn add_csv_document_with_types() { "###); let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; - snapshot!(code, @"200 OK"); + // 
snapshot!(code, @"200 OK"); snapshot!(json_string!(documents), @r###" { "results": [ diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index 8e9a3a696..055f6512f 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -719,7 +719,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!(null)).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type: expected an object, but found null", "code": "bad_request", @@ -730,7 +730,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!({ "offset": "doggo" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type at `.offset`: expected a positive integer, but found a string: `\"doggo\"`", "code": "invalid_document_offset", @@ -741,7 +741,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!({ "limit": "doggo" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type at `.limit`: expected a positive integer, but found a string: `\"doggo\"`", "code": "invalid_document_limit", @@ -752,7 +752,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!({ "fields": "doggo" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type at `.fields`: expected an array, but found a string: `\"doggo\"`", "code": "invalid_document_fields", @@ -763,7 +763,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!({ "filter": true })).await; snapshot!(code, @"400 Bad 
Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid syntax for the filter parameter: `expected String, Array, found: true`.", "code": "invalid_document_filter", @@ -774,7 +774,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!({ "filter": "cool doggo" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `cool doggo`.\n1:11 cool doggo", "code": "invalid_document_filter", @@ -786,7 +786,7 @@ async fn fetch_document_by_filter() { let (response, code) = index.get_document_by_filter(json!({ "filter": "doggo = bernese" })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Attribute `doggo` is not filterable. Available filterable attributes are: `color`.\n1:6 doggo = bernese", "code": "invalid_document_filter", @@ -803,7 +803,7 @@ async fn retrieve_vectors() { // GET ALL DOCUMENTS BY QUERY let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await; - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", "code": "invalid_document_retrieve_vectors", @@ -812,7 +812,7 @@ async fn retrieve_vectors() { } "###); let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await; - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", "code": "feature_not_enabled", @@ -824,7 +824,7 @@ async fn retrieve_vectors() { // FETCH ALL DOCUMENTS BY POST let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await; - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`", "code": "invalid_document_retrieve_vectors", @@ -833,7 +833,7 @@ async fn retrieve_vectors() { } "###); let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await; - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", "code": "feature_not_enabled", @@ -844,7 +844,7 @@ async fn retrieve_vectors() { // GET A SINGLE DOCUMENT let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", "code": "invalid_document_retrieve_vectors", @@ -853,7 +853,7 @@ async fn retrieve_vectors() { } "###); let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; - snapshot!(json_string!(response), @r###" + snapshot!(response, @r###" { "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", "code": "feature_not_enabled", diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index a95797227..b615902c2 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -212,7 +212,7 @@ async fn search_bad_retrieve_vectors() { } "###); - let (response, code) = index.search_get("retrieveVectors=").await; + let (response, code) = index.search_get("?retrieveVectors=").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -223,7 +223,7 @@ async fn search_bad_retrieve_vectors() { } "###); - let (response, code) = index.search_get("retrieveVectors=doggo").await; + let (response, code) = index.search_get("?retrieveVectors=doggo").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { diff --git a/meilisearch/tests/similar/errors.rs b/meilisearch/tests/similar/errors.rs index 8b2bb57a4..a6d7a3da6 100644 --- a/meilisearch/tests/similar/errors.rs +++ b/meilisearch/tests/similar/errors.rs @@ -785,7 +785,7 @@ async fn similar_bad_retrieve_vectors() { } "###); - let (response, code) = index.similar_get("retrieveVectors=").await; + let (response, code) = index.similar_get("?retrieveVectors=").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { @@ -796,7 +796,7 @@ async fn similar_bad_retrieve_vectors() { } "###); - let (response, code) = index.similar_get("retrieveVectors=doggo").await; + let (response, code) = index.similar_get("?retrieveVectors=doggo").await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { From 3d9befd64fd03bd6c63c85e057569e852b547c4c Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Jul 2024 15:30:16 +0200 Subject: [PATCH 110/110] fix warning --- meilisearch/tests/documents/add_documents.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/tests/documents/add_documents.rs 
b/meilisearch/tests/documents/add_documents.rs index 289873f63..5e32564c7 100644 --- a/meilisearch/tests/documents/add_documents.rs +++ b/meilisearch/tests/documents/add_documents.rs @@ -374,7 +374,7 @@ async fn add_csv_document_with_types() { "###); let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; - // snapshot!(code, @"200 OK"); + snapshot!(code, @"200 OK"); snapshot!(json_string!(documents), @r###" { "results": [