diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs
index f64301b8c..7b4a66df5 100644
--- a/crates/meilisearch-types/src/error.rs
+++ b/crates/meilisearch-types/src/error.rs
@@ -241,6 +241,7 @@ InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
 InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ;
 InvalidVectorsType , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
+InvalidDocumentIds , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ;
 InvalidSearchEmbedder , InvalidRequest , BAD_REQUEST ;
@@ -383,6 +384,7 @@ UnsupportedMediaType , InvalidRequest , UNSUPPORTED_MEDIA
 // Experimental features
 VectorEmbeddingError , InvalidRequest , BAD_REQUEST ;
 NotFoundSimilarId , InvalidRequest , BAD_REQUEST ;
+NotFoundDocumentId , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentEditionContext , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentEditionFunctionFilter , InvalidRequest , BAD_REQUEST ;
 EditDocumentsByFunctionError , InvalidRequest , BAD_REQUEST
diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs
index 54f01b4d6..8f063b31c 100644
--- a/crates/meilisearch/src/routes/indexes/documents.rs
+++ b/crates/meilisearch/src/routes/indexes/documents.rs
@@ -20,11 +20,13 @@ use meilisearch_types::index_uid::IndexUid;
 use meilisearch_types::milli::update::IndexDocumentsMethod;
 use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
 use meilisearch_types::milli::DocumentId;
+use meilisearch_types::serde_cs::vec::CS;
 use meilisearch_types::star_or::OptionStarOrList;
 use meilisearch_types::tasks::KindWithContent;
 use meilisearch_types::{milli, Document, Index};
 use mime::Mime;
 use once_cell::sync::Lazy;
+use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use tempfile::tempfile;
@@ -43,7 +45,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
 use crate::routes::{
     get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
 };
-use crate::search::{parse_filter, RetrieveVectors};
+use crate::search::{parse_filter, ExternalDocumentId, RetrieveVectors};
 use crate::{aggregate_methods, Opt};
 
 static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
@@ -387,6 +389,9 @@ pub struct BrowseQueryGet {
     #[param(default, value_type = Option<bool>)]
     #[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
     retrieve_vectors: Param<bool>,
+    #[param(default, value_type = Option<Vec<String>>)]
+    #[deserr(default, error = DeserrQueryParamError<InvalidDocumentIds>)]
+    ids: Option<CS<String>>,
     #[param(default, value_type = Option<String>, example = "popularity > 1000")]
     #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
     filter: Option<String>,
@@ -408,6 +413,9 @@ pub struct BrowseQuery {
     #[schema(default, example = true)]
     #[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)]
     retrieve_vectors: bool,
+    #[schema(value_type = Option<Vec<String>>, example = json!(["cody", "finn", "brandy", "gambit"]))]
+    #[deserr(default, error = DeserrJsonError<InvalidDocumentIds>)]
+    ids: Option<Vec<serde_json::Value>>,
     #[schema(default, value_type = Option<String>, example = "popularity > 1000")]
     #[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
     filter: Option<Value>,
@@ -551,7 +559,8 @@ pub async fn get_documents(
 ) -> Result<HttpResponse, ResponseError> {
     debug!(parameters = ?params, "Get documents GET");
 
-    let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner();
+    let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids } =
+        params.into_inner();
 
     let filter = match filter {
         Some(f) => match serde_json::from_str(&f) {
@@ -561,12 +570,15 @@ pub async fn get_documents(
         None => None,
     };
 
+    let ids = ids.map(|ids| ids.into_iter().map(Into::into).collect());
+
     let query = BrowseQuery {
         offset: offset.0,
         limit: limit.0,
         fields: fields.merge_star_and_none(),
         retrieve_vectors: retrieve_vectors.0,
         filter,
+        ids,
     };
 
     analytics.publish(
@@ -590,15 +602,30 @@ fn documents_by_query(
     query: BrowseQuery,
 ) -> Result<HttpResponse, ResponseError> {
     let index_uid = IndexUid::try_from(index_uid.into_inner())?;
-    let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query;
+    let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids } = query;
 
     let retrieve_vectors = RetrieveVectors::new(retrieve_vectors);
 
+    let ids = if let Some(ids) = ids {
+        let mut parsed_ids = Vec::with_capacity(ids.len());
+        for (index, id) in ids.into_iter().enumerate() {
+            let id = id.try_into().map_err(|error| {
+                let msg = format!("In `.ids[{index}]`:{error}");
+                ResponseError::from_msg(msg, Code::InvalidDocumentIds)
+            })?;
+            parsed_ids.push(id)
+        }
+        Some(parsed_ids)
+    } else {
+        None
+    };
+
     let index = index_scheduler.index(&index_uid)?;
     let (total, documents) = retrieve_documents(
         &index,
         offset,
         limit,
+        ids,
         filter,
         fields,
         retrieve_vectors,
@@ -1451,10 +1478,12 @@ fn some_documents<'a, 't: 'a>(
     }))
 }
 
+#[allow(clippy::too_many_arguments)]
 fn retrieve_documents<S: AsRef<str>>(
     index: &Index,
     offset: usize,
     limit: usize,
+    ids: Option<Vec<ExternalDocumentId>>,
     filter: Option<Value>,
     attributes_to_retrieve: Option<Vec<S>>,
     retrieve_vectors: RetrieveVectors,
@@ -1468,16 +1497,30 @@ fn retrieve_documents<S: AsRef<str>>(
         None
     };
 
-    let candidates = if let Some(filter) = filter {
-        filter.evaluate(&rtxn, index).map_err(|err| match err {
+    let mut candidates = if let Some(ids) = ids {
+        let external_document_ids = index.external_documents_ids();
+        let mut candidates = RoaringBitmap::new();
+        for (index, id) in ids.iter().enumerate() {
+            let Some(docid) = external_document_ids.get(&rtxn, id)? else {
+                let error = MeilisearchHttpError::DocumentNotFound(id.clone().into_inner());
+                let msg = format!("In `.ids[{index}]`: {error}");
+                return Err(ResponseError::from_msg(msg, Code::NotFoundDocumentId));
+            };
+            candidates.insert(docid);
+        }
+        candidates
+    } else {
+        index.documents_ids(&rtxn)?
+    };
+
+    if let Some(filter) = filter {
+        candidates &= filter.evaluate(&rtxn, index).map_err(|err| match err {
             milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                 ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter)
             }
             e => e.into(),
         })?
-    } else {
-        index.documents_ids(&rtxn)?
-    };
+    }
 
     let (it, number_of_documents) = {
         let number_of_documents = candidates.len();
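
Not part of the patch: a minimal, self-contained sketch of the candidate-selection logic that the last hunk introduces in `retrieve_documents`, for review context. The `HashMap` stands in for `index.external_documents_ids()`, the `all_docids` and `filter_matches` bitmaps stand in for `index.documents_ids(&rtxn)` and `filter.evaluate(&rtxn, index)`, and the function name `candidate_docids` is hypothetical; only the shape of the logic mirrors the diff.

use std::collections::HashMap;

use roaring::RoaringBitmap;

// Sketch of the new candidate selection: resolve external ids to internal
// docids first, then let the optional filter narrow the result.
fn candidate_docids(
    external_ids: &HashMap<String, u32>,     // stand-in for index.external_documents_ids()
    all_docids: &RoaringBitmap,              // stand-in for index.documents_ids(&rtxn)
    requested_ids: Option<&[String]>,        // the new `ids` parameter from the request
    filter_matches: Option<&RoaringBitmap>,  // stand-in for filter.evaluate(&rtxn, index)
) -> Result<RoaringBitmap, String> {
    // Start from the requested ids when present, otherwise from every document.
    let mut candidates = match requested_ids {
        Some(ids) => {
            let mut bitmap = RoaringBitmap::new();
            for (index, id) in ids.iter().enumerate() {
                // An unknown id aborts the request, mirroring `Code::NotFoundDocumentId`.
                let Some(&docid) = external_ids.get(id) else {
                    return Err(format!("In `.ids[{index}]`: Document `{id}` not found."));
                };
                bitmap.insert(docid);
            }
            bitmap
        }
        None => all_docids.clone(),
    };

    // The filter can only narrow the candidate set: a bitwise AND of the bitmaps.
    if let Some(filter_matches) = filter_matches {
        candidates &= filter_matches;
    }
    Ok(candidates)
}

fn main() {
    let external_ids = HashMap::from([("cody".to_string(), 0), ("finn".to_string(), 1)]);
    let all_docids: RoaringBitmap = (0..2).collect();
    let filter_matches: RoaringBitmap = [1u32].into_iter().collect();

    // Only "finn" (docid 1) survives both the ids restriction and the filter.
    let ids = ["cody".to_string(), "finn".to_string()];
    let candidates =
        candidate_docids(&external_ids, &all_docids, Some(&ids), Some(&filter_matches)).unwrap();
    assert_eq!(candidates.into_iter().collect::<Vec<u32>>(), vec![1]);
}

Two behaviours worth noting from the hunks above: the GET route accepts `ids` as a comma-separated list (`CS<String>`) while the POST body takes a JSON array, and an unknown id fails the whole request with `NotFoundDocumentId` instead of being silently skipped, with the filter only ever narrowing the id-derived candidate set.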