4929: Add facets support to federated r=Kerollmops a=dureuill

# Pull Request

## Related issue 

- Fixes #4932 (sprint issue)
- Fixes  #4913 (user-opened issue)

## What does this PR do?

See [public usage](https://meilisearch.notion.site/v1-11-Federated-search-59b30e03383c40729d7541a3dffb0069)

> [!CAUTION]
> This PR introduces a 🚨**breaking change**🚨: `queries.facets` when `federation` is present and non-`null` is now **an error**

### Implementation standpoint:

- Facet distribution: fix issue where truncated facet distribution would have a wrong order
- facet distribution: implement Display for OrderBy


Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-09-18 09:47:20 +00:00 committed by GitHub
commit 29c3aca72a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 1958 additions and 73 deletions

View File

@ -238,8 +238,14 @@ InvalidIndexLimit , InvalidRequest , BAD_REQUEST ;
InvalidIndexOffset , InvalidRequest , BAD_REQUEST ; InvalidIndexOffset , InvalidRequest , BAD_REQUEST ;
InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ; InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ;
InvalidIndexUid , InvalidRequest , BAD_REQUEST ; InvalidIndexUid , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchFacets , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchFacetsByIndex , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchFacetOrder , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchFederated , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchFederated , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchFederationOptions , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchFederationOptions , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchMaxValuesPerFacet , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchMergeFacets , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchQueryFacets , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchQueryPagination , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryPagination , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchQueryRankingRules , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryRankingRules , InvalidRequest , BAD_REQUEST ;
InvalidMultiSearchWeight , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchWeight , InvalidRequest , BAD_REQUEST ;

View File

@ -1,3 +1,4 @@
use std::borrow::Borrow;
use std::error::Error; use std::error::Error;
use std::fmt; use std::fmt;
use std::str::FromStr; use std::str::FromStr;
@ -8,7 +9,7 @@ use crate::error::{Code, ErrorCode};
/// An index uid is composed of only ascii alphanumeric characters, - and _, between 1 and 400 /// An index uid is composed of only ascii alphanumeric characters, - and _, between 1 and 400
/// bytes long /// bytes long
#[derive(Debug, Clone, PartialEq, Eq, Deserr)] #[derive(Debug, Clone, PartialEq, Eq, Deserr, PartialOrd, Ord)]
#[deserr(try_from(String) = IndexUid::try_from -> IndexUidFormatError)] #[deserr(try_from(String) = IndexUid::try_from -> IndexUidFormatError)]
pub struct IndexUid(String); pub struct IndexUid(String);
@ -70,6 +71,12 @@ impl From<IndexUid> for String {
} }
} }
impl Borrow<String> for IndexUid {
fn borrow(&self) -> &String {
&self.0
}
}
#[derive(Debug)] #[derive(Debug)]
pub struct IndexUidFormatError { pub struct IndexUidFormatError {
pub invalid_uid: String, pub invalid_uid: String,

View File

@ -4,6 +4,7 @@ use byte_unit::{Byte, UnitType};
use meilisearch_types::document_formats::{DocumentFormatError, PayloadType}; use meilisearch_types::document_formats::{DocumentFormatError, PayloadType};
use meilisearch_types::error::{Code, ErrorCode, ResponseError}; use meilisearch_types::error::{Code, ErrorCode, ResponseError};
use meilisearch_types::index_uid::{IndexUid, IndexUidFormatError}; use meilisearch_types::index_uid::{IndexUid, IndexUidFormatError};
use meilisearch_types::milli::OrderBy;
use serde_json::Value; use serde_json::Value;
use tokio::task::JoinError; use tokio::task::JoinError;
@ -27,10 +28,20 @@ pub enum MeilisearchHttpError {
EmptyFilter, EmptyFilter,
#[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))] #[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))]
InvalidExpression(&'static [&'static str], Value), InvalidExpression(&'static [&'static str], Value),
#[error("Using `federationOptions` is not allowed in a non-federated search.\n Hint: remove `federationOptions` from query #{0} or add `federation: {{}}` to the request.")] #[error("Using `federationOptions` is not allowed in a non-federated search.\n - Hint: remove `federationOptions` from query #{0} or add `federation` to the request.")]
FederationOptionsInNonFederatedRequest(usize), FederationOptionsInNonFederatedRequest(usize),
#[error("Inside `.queries[{0}]`: Using pagination options is not allowed in federated queries.\n Hint: remove `{1}` from query #{0} or remove `federation: {{}}` from the request")] #[error("Inside `.queries[{0}]`: Using pagination options is not allowed in federated queries.\n - Hint: remove `{1}` from query #{0} or remove `federation` from the request\n - Hint: pass `federation.limit` and `federation.offset` for pagination in federated search")]
PaginationInFederatedQuery(usize, &'static str), PaginationInFederatedQuery(usize, &'static str),
#[error("Inside `.queries[{0}]`: Using facet options is not allowed in federated queries.\n - Hint: remove `facets` from query #{0} or remove `federation` from the request\n - Hint: pass `federation.facetsByIndex.{1}: {2:?}` for facets in federated search")]
FacetsInFederatedQuery(usize, String, Vec<String>),
#[error("Inconsistent order for values in facet `{facet}`: index `{previous_uid}` orders {previous_facet_order}, but index `{current_uid}` orders {index_facet_order}.\n - Hint: Remove `federation.mergeFacets` or change `faceting.sortFacetValuesBy` to be consistent in settings.")]
InconsistentFacetOrder {
facet: String,
previous_facet_order: OrderBy,
previous_uid: String,
index_facet_order: OrderBy,
current_uid: String,
},
#[error("A {0} payload is missing.")] #[error("A {0} payload is missing.")]
MissingPayload(PayloadType), MissingPayload(PayloadType),
#[error("Too many search requests running at the same time: {0}. Retry after 10s.")] #[error("Too many search requests running at the same time: {0}. Retry after 10s.")]
@ -96,6 +107,10 @@ impl ErrorCode for MeilisearchHttpError {
MeilisearchHttpError::PaginationInFederatedQuery(_, _) => { MeilisearchHttpError::PaginationInFederatedQuery(_, _) => {
Code::InvalidMultiSearchQueryPagination Code::InvalidMultiSearchQueryPagination
} }
MeilisearchHttpError::FacetsInFederatedQuery(..) => Code::InvalidMultiSearchQueryFacets,
MeilisearchHttpError::InconsistentFacetOrder { .. } => {
Code::InvalidMultiSearchFacetOrder
}
} }
} }
} }

View File

@ -9,20 +9,24 @@ use std::vec::{IntoIter, Vec};
use actix_http::StatusCode; use actix_http::StatusCode;
use index_scheduler::{IndexScheduler, RoFeatures}; use index_scheduler::{IndexScheduler, RoFeatures};
use indexmap::IndexMap;
use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::deserr_codes::{ use meilisearch_types::error::deserr_codes::{
InvalidMultiSearchWeight, InvalidSearchLimit, InvalidSearchOffset, InvalidMultiSearchFacetsByIndex, InvalidMultiSearchMaxValuesPerFacet,
InvalidMultiSearchMergeFacets, InvalidMultiSearchWeight, InvalidSearchLimit,
InvalidSearchOffset,
}; };
use meilisearch_types::error::ResponseError; use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::score_details::{ScoreDetails, ScoreValue}; use meilisearch_types::milli::score_details::{ScoreDetails, ScoreValue};
use meilisearch_types::milli::{self, DocumentId, TimeBudget}; use meilisearch_types::milli::{self, DocumentId, OrderBy, TimeBudget};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde::Serialize; use serde::Serialize;
use super::ranking_rules::{self, RankingRules}; use super::ranking_rules::{self, RankingRules};
use super::{ use super::{
prepare_search, AttributesFormat, HitMaker, HitsInfo, RetrieveVectors, SearchHit, SearchKind, compute_facet_distribution_stats, prepare_search, AttributesFormat, ComputedFacets, FacetStats,
SearchQuery, SearchQueryWithIndex, HitMaker, HitsInfo, RetrieveVectors, SearchHit, SearchKind, SearchQuery, SearchQueryWithIndex,
}; };
use crate::error::MeilisearchHttpError; use crate::error::MeilisearchHttpError;
use crate::routes::indexes::search::search_kind; use crate::routes::indexes::search::search_kind;
@ -73,6 +77,17 @@ pub struct Federation {
pub limit: usize, pub limit: usize,
#[deserr(default = super::DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)] #[deserr(default = super::DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize, pub offset: usize,
#[deserr(default, error = DeserrJsonError<InvalidMultiSearchFacetsByIndex>)]
pub facets_by_index: BTreeMap<IndexUid, Option<Vec<String>>>,
#[deserr(default, error = DeserrJsonError<InvalidMultiSearchMergeFacets>)]
pub merge_facets: Option<MergeFacets>,
}
#[derive(Copy, Clone, Debug, deserr::Deserr, Default)]
#[deserr(error = DeserrJsonError<InvalidMultiSearchMergeFacets>, rename_all = camelCase, deny_unknown_fields)]
pub struct MergeFacets {
#[deserr(default, error = DeserrJsonError<InvalidMultiSearchMaxValuesPerFacet>)]
pub max_values_per_facet: Option<usize>,
} }
#[derive(Debug, deserr::Deserr)] #[derive(Debug, deserr::Deserr)]
@ -82,7 +97,7 @@ pub struct FederatedSearch {
#[deserr(default)] #[deserr(default)]
pub federation: Option<Federation>, pub federation: Option<Federation>,
} }
#[derive(Serialize, Clone, PartialEq)] #[derive(Serialize, Clone)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct FederatedSearchResult { pub struct FederatedSearchResult {
pub hits: Vec<SearchHit>, pub hits: Vec<SearchHit>,
@ -93,6 +108,13 @@ pub struct FederatedSearchResult {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub semantic_hit_count: Option<u32>, pub semantic_hit_count: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub facet_distribution: Option<BTreeMap<String, IndexMap<String, u64>>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub facet_stats: Option<BTreeMap<String, FacetStats>>,
#[serde(skip_serializing_if = "FederatedFacets::is_empty")]
pub facets_by_index: FederatedFacets,
// These fields are only used for analytics purposes // These fields are only used for analytics purposes
#[serde(skip)] #[serde(skip)]
pub degraded: bool, pub degraded: bool,
@ -109,6 +131,9 @@ impl fmt::Debug for FederatedSearchResult {
semantic_hit_count, semantic_hit_count,
degraded, degraded,
used_negative_operator, used_negative_operator,
facet_distribution,
facet_stats,
facets_by_index,
} = self; } = self;
let mut debug = f.debug_struct("SearchResult"); let mut debug = f.debug_struct("SearchResult");
@ -122,9 +147,18 @@ impl fmt::Debug for FederatedSearchResult {
if *degraded { if *degraded {
debug.field("degraded", degraded); debug.field("degraded", degraded);
} }
if let Some(facet_distribution) = facet_distribution {
debug.field("facet_distribution", &facet_distribution);
}
if let Some(facet_stats) = facet_stats {
debug.field("facet_stats", &facet_stats);
}
if let Some(semantic_hit_count) = semantic_hit_count { if let Some(semantic_hit_count) = semantic_hit_count {
debug.field("semantic_hit_count", &semantic_hit_count); debug.field("semantic_hit_count", &semantic_hit_count);
} }
if !facets_by_index.is_empty() {
debug.field("facets_by_index", &facets_by_index);
}
debug.finish() debug.finish()
} }
@ -313,16 +347,104 @@ struct SearchHitByIndex {
} }
struct SearchResultByIndex { struct SearchResultByIndex {
index: String,
hits: Vec<SearchHitByIndex>, hits: Vec<SearchHitByIndex>,
candidates: RoaringBitmap, estimated_total_hits: usize,
degraded: bool, degraded: bool,
used_negative_operator: bool, used_negative_operator: bool,
facets: Option<ComputedFacets>,
}
#[derive(Debug, Clone, Default, Serialize)]
pub struct FederatedFacets(pub BTreeMap<String, ComputedFacets>);
impl FederatedFacets {
pub fn insert(&mut self, index: String, facets: Option<ComputedFacets>) {
if let Some(facets) = facets {
self.0.insert(index, facets);
}
}
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
pub fn merge(
self,
MergeFacets { max_values_per_facet }: MergeFacets,
facet_order: BTreeMap<String, (String, OrderBy)>,
) -> Option<ComputedFacets> {
if self.is_empty() {
return None;
}
let mut distribution: BTreeMap<String, _> = Default::default();
let mut stats: BTreeMap<String, FacetStats> = Default::default();
for facets_by_index in self.0.into_values() {
for (facet, index_distribution) in facets_by_index.distribution {
match distribution.entry(facet) {
std::collections::btree_map::Entry::Vacant(entry) => {
entry.insert(index_distribution);
}
std::collections::btree_map::Entry::Occupied(mut entry) => {
let distribution = entry.get_mut();
for (value, index_count) in index_distribution {
distribution
.entry(value)
.and_modify(|count| *count += index_count)
.or_insert(index_count);
}
}
}
}
for (facet, index_stats) in facets_by_index.stats {
match stats.entry(facet) {
std::collections::btree_map::Entry::Vacant(entry) => {
entry.insert(index_stats);
}
std::collections::btree_map::Entry::Occupied(mut entry) => {
let stats = entry.get_mut();
stats.min = f64::min(stats.min, index_stats.min);
stats.max = f64::max(stats.max, index_stats.max);
}
}
}
}
// fixup order
for (facet, values) in &mut distribution {
let order_by = facet_order.get(facet).map(|(_, order)| *order).unwrap_or_default();
match order_by {
OrderBy::Lexicographic => {
values.sort_unstable_by(|left, _, right, _| left.cmp(right))
}
OrderBy::Count => {
values.sort_unstable_by(|_, left, _, right| {
left.cmp(right)
// biggest first
.reverse()
})
}
}
if let Some(max_values_per_facet) = max_values_per_facet {
values.truncate(max_values_per_facet)
};
}
Some(ComputedFacets { distribution, stats })
}
} }
pub fn perform_federated_search( pub fn perform_federated_search(
index_scheduler: &IndexScheduler, index_scheduler: &IndexScheduler,
queries: Vec<SearchQueryWithIndex>, queries: Vec<SearchQueryWithIndex>,
federation: Federation, mut federation: Federation,
features: RoFeatures, features: RoFeatures,
) -> Result<FederatedSearchResult, ResponseError> { ) -> Result<FederatedSearchResult, ResponseError> {
let before_search = std::time::Instant::now(); let before_search = std::time::Instant::now();
@ -342,6 +464,16 @@ pub fn perform_federated_search(
.into()); .into());
} }
if let Some(facets) = federated_query.has_facets() {
let facets = facets.to_owned();
return Err(MeilisearchHttpError::FacetsInFederatedQuery(
query_index,
federated_query.index_uid.into_inner(),
facets,
)
.into());
}
let (index_uid, query, federation_options) = federated_query.into_index_query_federation(); let (index_uid, query, federation_options) = federated_query.into_index_query_federation();
queries_by_index.entry(index_uid.into_inner()).or_default().push(QueryByIndex { queries_by_index.entry(index_uid.into_inner()).or_default().push(QueryByIndex {
@ -353,13 +485,24 @@ pub fn perform_federated_search(
// 2. perform queries, merge and make hits index by index // 2. perform queries, merge and make hits index by index
let required_hit_count = federation.limit + federation.offset; let required_hit_count = federation.limit + federation.offset;
// In step (2), semantic_hit_count will be set to Some(0) if any search kind uses semantic // In step (2), semantic_hit_count will be set to Some(0) if any search kind uses semantic
// Then in step (3), we'll update its value if there is any semantic search // Then in step (3), we'll update its value if there is any semantic search
let mut semantic_hit_count = None; let mut semantic_hit_count = None;
let mut results_by_index = Vec::with_capacity(queries_by_index.len()); let mut results_by_index = Vec::with_capacity(queries_by_index.len());
let mut previous_query_data: Option<(RankingRules, usize, String)> = None; let mut previous_query_data: Option<(RankingRules, usize, String)> = None;
// remember the order and name of first index for each facet when merging with index settings
// to detect if the order is inconsistent for a facet.
let mut facet_order: Option<BTreeMap<String, (String, OrderBy)>> = match federation.merge_facets
{
Some(MergeFacets { .. }) => Some(Default::default()),
_ => None,
};
for (index_uid, queries) in queries_by_index { for (index_uid, queries) in queries_by_index {
let first_query_index = queries.first().map(|query| query.query_index);
let index = match index_scheduler.index(&index_uid) { let index = match index_scheduler.index(&index_uid) {
Ok(index) => index, Ok(index) => index,
Err(err) => { Err(err) => {
@ -367,9 +510,8 @@ pub fn perform_federated_search(
// Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but
// here the resource not found is not part of the URL. // here the resource not found is not part of the URL.
err.code = StatusCode::BAD_REQUEST; err.code = StatusCode::BAD_REQUEST;
if let Some(query) = queries.first() { if let Some(query_index) = first_query_index {
err.message = err.message = format!("Inside `.queries[{}]`: {}", query_index, err.message);
format!("Inside `.queries[{}]`: {}", query.query_index, err.message);
} }
return Err(err); return Err(err);
} }
@ -394,6 +536,23 @@ pub fn perform_federated_search(
let mut used_negative_operator = false; let mut used_negative_operator = false;
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
let facets_by_index = federation.facets_by_index.remove(&index_uid).flatten();
// TODO: recover the max size + facets_by_index as return value of this function so as not to ask it for all queries
if let Err(mut error) =
check_facet_order(&mut facet_order, &index_uid, &facets_by_index, &index, &rtxn)
{
error.message = format!(
"Inside `.federation.facetsByIndex.{index_uid}`: {error}{}",
if let Some(query_index) = first_query_index {
format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`")
} else {
Default::default()
}
);
return Err(error);
}
// 2.1. Compute all candidates for each query in the index // 2.1. Compute all candidates for each query in the index
let mut results_by_query = Vec::with_capacity(queries.len()); let mut results_by_query = Vec::with_capacity(queries.len());
@ -562,34 +721,116 @@ pub fn perform_federated_search(
.collect(); .collect();
let merged_result = merged_result?; let merged_result = merged_result?;
let estimated_total_hits = candidates.len() as usize;
let facets = facets_by_index
.map(|facets_by_index| {
compute_facet_distribution_stats(
&facets_by_index,
&index,
&rtxn,
candidates,
super::Route::MultiSearch,
)
})
.transpose()
.map_err(|mut error| {
error.message = format!(
"Inside `.federation.facetsByIndex.{index_uid}`: {}{}",
error.message,
if let Some(query_index) = first_query_index {
format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`")
} else {
Default::default()
}
);
error
})?;
results_by_index.push(SearchResultByIndex { results_by_index.push(SearchResultByIndex {
index: index_uid,
hits: merged_result, hits: merged_result,
candidates, estimated_total_hits,
degraded, degraded,
used_negative_operator, used_negative_operator,
facets,
}); });
} }
// bonus step, make sure to return an error if an index wants a non-faceted field, even if no query actually uses that index.
for (index_uid, facets) in federation.facets_by_index {
let index = match index_scheduler.index(&index_uid) {
Ok(index) => index,
Err(err) => {
let mut err = ResponseError::from(err);
// Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but
// here the resource not found is not part of the URL.
err.code = StatusCode::BAD_REQUEST;
err.message = format!(
"Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries",
err.message
);
return Err(err);
}
};
// Important: this is the only transaction we'll use for this index during this federated search
let rtxn = index.read_txn()?;
if let Err(mut error) =
check_facet_order(&mut facet_order, &index_uid, &facets, &index, &rtxn)
{
error.message = format!(
"Inside `.federation.facetsByIndex.{index_uid}`: {error}\n - Note: index `{index_uid}` is not used in queries",
);
return Err(error);
}
if let Some(facets) = facets {
if let Err(mut error) = compute_facet_distribution_stats(
&facets,
&index,
&rtxn,
Default::default(),
super::Route::MultiSearch,
) {
error.message =
format!("Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", error.message);
return Err(error);
}
}
}
// 3. merge hits and metadata across indexes // 3. merge hits and metadata across indexes
// 3.1 merge metadata // 3.1 merge metadata
let (estimated_total_hits, degraded, used_negative_operator) = { let (estimated_total_hits, degraded, used_negative_operator, facets) = {
let mut estimated_total_hits = 0; let mut estimated_total_hits = 0;
let mut degraded = false; let mut degraded = false;
let mut used_negative_operator = false; let mut used_negative_operator = false;
let mut facets: FederatedFacets = FederatedFacets::default();
for SearchResultByIndex { for SearchResultByIndex {
index,
hits: _, hits: _,
candidates, estimated_total_hits: estimated_total_hits_by_index,
facets: facets_by_index,
degraded: degraded_by_index, degraded: degraded_by_index,
used_negative_operator: used_negative_operator_by_index, used_negative_operator: used_negative_operator_by_index,
} in &results_by_index } in &mut results_by_index
{ {
estimated_total_hits += candidates.len() as usize; estimated_total_hits += *estimated_total_hits_by_index;
degraded |= *degraded_by_index; degraded |= *degraded_by_index;
used_negative_operator |= *used_negative_operator_by_index; used_negative_operator |= *used_negative_operator_by_index;
let facets_by_index = std::mem::take(facets_by_index);
let index = std::mem::take(index);
facets.insert(index, facets_by_index);
} }
(estimated_total_hits, degraded, used_negative_operator) (estimated_total_hits, degraded, used_negative_operator, facets)
}; };
// 3.2 merge hits // 3.2 merge hits
@ -606,6 +847,20 @@ pub fn perform_federated_search(
.map(|hit| hit.hit) .map(|hit| hit.hit)
.collect(); .collect();
let (facet_distribution, facet_stats, facets_by_index) =
match federation.merge_facets.zip(facet_order) {
Some((merge_facets, facet_order)) => {
let facets = facets.merge(merge_facets, facet_order);
let (facet_distribution, facet_stats) = facets
.map(|ComputedFacets { distribution, stats }| (distribution, stats))
.unzip();
(facet_distribution, facet_stats, FederatedFacets::default())
}
None => (None, None, facets),
};
let search_result = FederatedSearchResult { let search_result = FederatedSearchResult {
hits: merged_hits, hits: merged_hits,
processing_time_ms: before_search.elapsed().as_millis(), processing_time_ms: before_search.elapsed().as_millis(),
@ -617,7 +872,39 @@ pub fn perform_federated_search(
semantic_hit_count, semantic_hit_count,
degraded, degraded,
used_negative_operator, used_negative_operator,
facet_distribution,
facet_stats,
facets_by_index,
}; };
Ok(search_result) Ok(search_result)
} }
fn check_facet_order(
facet_order: &mut Option<BTreeMap<String, (String, OrderBy)>>,
current_index: &str,
facets_by_index: &Option<Vec<String>>,
index: &milli::Index,
rtxn: &milli::heed::RoTxn<'_>,
) -> Result<(), ResponseError> {
if let (Some(facet_order), Some(facets_by_index)) = (facet_order, facets_by_index) {
let index_facet_order = index.sort_facet_values_by(rtxn)?;
for facet in facets_by_index {
let index_facet_order = index_facet_order.get(facet);
let (previous_index, previous_facet_order) = facet_order
.entry(facet.to_owned())
.or_insert_with(|| (current_index.to_owned(), index_facet_order));
if previous_facet_order != &index_facet_order {
return Err(MeilisearchHttpError::InconsistentFacetOrder {
facet: facet.clone(),
previous_facet_order: *previous_facet_order,
previous_uid: previous_index.clone(),
current_uid: current_index.to_owned(),
index_facet_order,
}
.into());
}
}
};
Ok(())
}

View File

@ -441,9 +441,6 @@ pub struct SearchQueryWithIndex {
} }
impl SearchQueryWithIndex { impl SearchQueryWithIndex {
pub fn has_federation_options(&self) -> bool {
self.federation_options.is_some()
}
pub fn has_pagination(&self) -> Option<&'static str> { pub fn has_pagination(&self) -> Option<&'static str> {
if self.offset.is_some() { if self.offset.is_some() {
Some("offset") Some("offset")
@ -458,6 +455,10 @@ impl SearchQueryWithIndex {
} }
} }
pub fn has_facets(&self) -> Option<&[String]> {
self.facets.as_deref().filter(|v| !v.is_empty())
}
pub fn into_index_query_federation(self) -> (IndexUid, SearchQuery, Option<FederationOptions>) { pub fn into_index_query_federation(self) -> (IndexUid, SearchQuery, Option<FederationOptions>) {
let SearchQueryWithIndex { let SearchQueryWithIndex {
index_uid, index_uid,
@ -987,39 +988,13 @@ pub fn perform_search(
HitsInfo::OffsetLimit { limit, offset, estimated_total_hits: number_of_hits } HitsInfo::OffsetLimit { limit, offset, estimated_total_hits: number_of_hits }
}; };
let (facet_distribution, facet_stats) = match facets { let (facet_distribution, facet_stats) = facets
Some(ref fields) => { .map(move |facets| {
let mut facet_distribution = index.facets_distribution(&rtxn); compute_facet_distribution_stats(&facets, index, &rtxn, candidates, Route::Search)
})
let max_values_by_facet = index .transpose()?
.max_values_per_facet(&rtxn) .map(|ComputedFacets { distribution, stats }| (distribution, stats))
.map_err(milli::Error::from)? .unzip();
.map(|x| x as usize)
.unwrap_or(DEFAULT_VALUES_PER_FACET);
facet_distribution.max_values_per_facet(max_values_by_facet);
let sort_facet_values_by =
index.sort_facet_values_by(&rtxn).map_err(milli::Error::from)?;
if fields.iter().all(|f| f != "*") {
let fields: Vec<_> =
fields.iter().map(|n| (n, sort_facet_values_by.get(n))).collect();
facet_distribution.facets(fields);
}
let distribution = facet_distribution
.candidates(candidates)
.default_order_by(sort_facet_values_by.get("*"))
.execute()?;
let stats = facet_distribution.compute_stats()?;
(Some(distribution), Some(stats))
}
None => (None, None),
};
let facet_stats = facet_stats.map(|stats| {
stats.into_iter().map(|(k, (min, max))| (k, FacetStats { min, max })).collect()
});
let result = SearchResult { let result = SearchResult {
hits: documents, hits: documents,
@ -1035,6 +1010,61 @@ pub fn perform_search(
Ok(result) Ok(result)
} }
#[derive(Debug, Clone, Default, Serialize)]
pub struct ComputedFacets {
pub distribution: BTreeMap<String, IndexMap<String, u64>>,
pub stats: BTreeMap<String, FacetStats>,
}
enum Route {
Search,
MultiSearch,
}
fn compute_facet_distribution_stats<S: AsRef<str>>(
facets: &[S],
index: &Index,
rtxn: &RoTxn,
candidates: roaring::RoaringBitmap,
route: Route,
) -> Result<ComputedFacets, ResponseError> {
let mut facet_distribution = index.facets_distribution(rtxn);
let max_values_by_facet = index
.max_values_per_facet(rtxn)
.map_err(milli::Error::from)?
.map(|x| x as usize)
.unwrap_or(DEFAULT_VALUES_PER_FACET);
facet_distribution.max_values_per_facet(max_values_by_facet);
let sort_facet_values_by = index.sort_facet_values_by(rtxn).map_err(milli::Error::from)?;
// add specific facet if there is no placeholder
if facets.iter().all(|f| f.as_ref() != "*") {
let fields: Vec<_> =
facets.iter().map(|n| (n, sort_facet_values_by.get(n.as_ref()))).collect();
facet_distribution.facets(fields);
}
let distribution = facet_distribution
.candidates(candidates)
.default_order_by(sort_facet_values_by.get("*"))
.execute()
.map_err(|error| match (error, route) {
(
error @ milli::Error::UserError(milli::UserError::InvalidFacetsDistribution {
..
}),
Route::MultiSearch,
) => ResponseError::from_msg(error.to_string(), Code::InvalidMultiSearchFacets),
(error, _) => error.into(),
})?;
let stats = facet_distribution.compute_stats()?;
let stats = stats.into_iter().map(|(k, (min, max))| (k, FacetStats { min, max })).collect();
Ok(ComputedFacets { distribution, stats })
}
pub fn search_from_kind( pub fn search_from_kind(
search_kind: SearchKind, search_kind: SearchKind,
search: milli::Search<'_>, search: milli::Search<'_>,

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,5 @@
use std::collections::{BTreeMap, HashMap, HashSet}; use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Display;
use std::ops::ControlFlow; use std::ops::ControlFlow;
use std::{fmt, mem}; use std::{fmt, mem};
@ -37,6 +38,15 @@ pub enum OrderBy {
Count, Count,
} }
impl Display for OrderBy {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
OrderBy::Lexicographic => f.write_str("alphabetically"),
OrderBy::Count => f.write_str("by count"),
}
}
}
pub struct FacetDistribution<'a> { pub struct FacetDistribution<'a> {
facets: Option<HashMap<String, OrderBy>>, facets: Option<HashMap<String, OrderBy>>,
candidates: Option<RoaringBitmap>, candidates: Option<RoaringBitmap>,
@ -100,7 +110,6 @@ impl<'a> FacetDistribution<'a> {
let mut lexicographic_distribution = BTreeMap::new(); let mut lexicographic_distribution = BTreeMap::new();
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
let distribution_prelength = distribution.len();
let db = self.index.field_id_docid_facet_f64s; let db = self.index.field_id_docid_facet_f64s;
for docid in candidates { for docid in candidates {
key_buffer.truncate(mem::size_of::<FieldId>()); key_buffer.truncate(mem::size_of::<FieldId>());
@ -113,23 +122,21 @@ impl<'a> FacetDistribution<'a> {
for result in iter { for result in iter {
let ((_, _, value), ()) = result?; let ((_, _, value), ()) = result?;
*lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1; *lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1;
if lexicographic_distribution.len() - distribution_prelength
== self.max_values_per_facet
{
break;
}
} }
} }
distribution.extend(lexicographic_distribution); distribution.extend(
lexicographic_distribution
.into_iter()
.take(self.max_values_per_facet.saturating_sub(distribution.len())),
);
} }
FacetType::String => { FacetType::String => {
let mut normalized_distribution = BTreeMap::new(); let mut normalized_distribution = BTreeMap::new();
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
let db = self.index.field_id_docid_facet_strings; let db = self.index.field_id_docid_facet_strings;
'outer: for docid in candidates { for docid in candidates {
key_buffer.truncate(mem::size_of::<FieldId>()); key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes()); key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db let iter = db
@ -144,14 +151,14 @@ impl<'a> FacetDistribution<'a> {
.or_insert_with(|| (original_value, 0)); .or_insert_with(|| (original_value, 0));
*count += 1; *count += 1;
if normalized_distribution.len() == self.max_values_per_facet { // we'd like to break here if we have enough facet values, but we are collecting them by increasing docid,
break 'outer; // so higher ranked facets could be in later docids
}
} }
} }
let iter = normalized_distribution let iter = normalized_distribution
.into_iter() .into_iter()
.take(self.max_values_per_facet.saturating_sub(distribution.len()))
.map(|(_normalized, (original, count))| (original.to_string(), count)); .map(|(_normalized, (original, count))| (original.to_string(), count));
distribution.extend(iter); distribution.extend(iter);
} }
@ -467,7 +474,7 @@ mod tests {
.execute() .execute()
.unwrap(); .unwrap();
milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 1}}"###); milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###);
let map = FacetDistribution::new(&txn, &index) let map = FacetDistribution::new(&txn, &index)
.facets(iter::once(("colour", OrderBy::Count))) .facets(iter::once(("colour", OrderBy::Count)))