diff --git a/Cargo.lock b/Cargo.lock index 66b1fd10d..d6b4aebb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1108,40 +1108,17 @@ dependencies = [ "syn", ] -[[package]] -name = "deserr" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86290491a2b5c21a1a5083da8dae831006761258fabd5617309c3eebc5f89468" -dependencies = [ - "deserr-internal 0.1.4", - "serde-cs", - "serde_json", -] - [[package]] name = "deserr" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28380303ca15ec07e1d5b079baf19cf849b09edad5cab219c1c51b2bd07523de" dependencies = [ - "deserr-internal 0.3.0", + "deserr-internal", "serde-cs", "serde_json", ] -[[package]] -name = "deserr-internal" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7131de1c27581bc376a22166c9f570be91b76cb096be2f6aecf224c27bf7c49a" -dependencies = [ - "convert_case 0.5.0", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "deserr-internal" version = "0.3.0" @@ -2477,7 +2454,7 @@ dependencies = [ "cargo_toml", "clap 4.0.32", "crossbeam-channel", - "deserr 0.3.0", + "deserr", "dump", "either", "env_logger", @@ -2569,7 +2546,7 @@ dependencies = [ "anyhow", "convert_case 0.6.0", "csv", - "deserr 0.3.0", + "deserr", "either", "enum-iterator", "file-store", @@ -2628,7 +2605,7 @@ dependencies = [ "concat-arrays", "crossbeam-channel", "csv", - "deserr 0.1.4", + "deserr", "either", "filter-parser", "flatten-serde-json", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b32592ab9..d68845be5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -12,7 +12,7 @@ byteorder = "1.4.3" charabia = { version = "0.7.0", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.6" -deserr = "0.1.4" +deserr = "0.3.0" either = "1.8.0" flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" diff --git a/milli/src/error.rs b/milli/src/error.rs index 8734cb540..f96c633f2 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,5 +1,6 @@ use std::collections::BTreeSet; use std::convert::Infallible; +use std::fmt::Write; use std::{io, str}; use heed::{Error as HeedError, MdbError}; @@ -100,10 +101,11 @@ A document identifier can be of type integer or string, \ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", .document_id.to_string() )] InvalidDocumentId { document_id: Value }, - #[error("Invalid facet distribution, the fields `{}` are not set as filterable.", - .invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") - )] - InvalidFacetsDistribution { invalid_facets_name: BTreeSet }, + #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))] + InvalidFacetsDistribution { + invalid_facets_name: BTreeSet, + valid_facets_name: BTreeSet, + }, #[error(transparent)] InvalidGeoField(#[from] GeoError), #[error("{0}")] @@ -152,6 +154,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco pub enum GeoError { #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] NotAnObject { document_id: Value, value: Value }, + #[error("The `_geo` field in the document with the id: `{document_id}` contains the following unexpected fields: `{value}`.")] + UnexpectedExtraFields { document_id: Value, value: Value }, #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")] MissingLatitudeAndLongitude { document_id: Value }, #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] @@ -166,6 +170,50 @@ pub enum GeoError { BadLongitude { document_id: Value, value: Value }, } +fn format_invalid_filter_distribution( + invalid_facets_name: &BTreeSet, + valid_facets_name: &BTreeSet, +) -> String { + if valid_facets_name.is_empty() { + return "this index does not have configured filterable attributes.".into(); + } + + let mut result = String::new(); + + match invalid_facets_name.len() { + 0 => (), + 1 => write!( + result, + "attribute `{}` is not filterable.", + invalid_facets_name.first().unwrap() + ) + .unwrap(), + _ => write!( + result, + "attributes `{}` are not filterable.", + invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") + ) + .unwrap(), + }; + + match valid_facets_name.len() { + 1 => write!( + result, + " The available filterable attribute is `{}`.", + valid_facets_name.first().unwrap() + ) + .unwrap(), + _ => write!( + result, + " The available filterable attributes are `{}`.", + valid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") + ) + .unwrap(), + } + + result +} + /// A little macro helper to autogenerate From implementation that needs two `Into`. /// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)` /// the macro will create the following code: diff --git a/milli/src/index.rs b/milli/src/index.rs index 31311d318..efb408d19 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -426,7 +426,7 @@ impl Index { } /// Returns the `rtree` which associates coordinates to documents ids. - pub fn geo_rtree(&self, rtxn: &'_ RoTxn) -> Result>> { + pub fn geo_rtree(&self, rtxn: &RoTxn) -> Result>> { match self .main .get::<_, Str, SerdeBincode>>(rtxn, main_key::GEO_RTREE_KEY)? @@ -2292,4 +2292,69 @@ pub(crate) mod tests { assert!(all_ids.insert(id)); } } + + #[test] + fn bug_3007() { + // https://github.com/meilisearch/meilisearch/issues/3007 + + use crate::error::{GeoError, UserError}; + let index = TempIndex::new(); + + // Given is an index with a geo field NOT contained in the sortable_fields of the settings + index + .update_settings(|settings| { + settings.set_primary_key("id".to_string()); + settings.set_filterable_fields(HashSet::from(["_geo".to_string()])); + }) + .unwrap(); + + // happy path + index.add_documents(documents!({ "id" : 5, "_geo": {"lat": 12.0, "lng": 11.0}})).unwrap(); + + db_snap!(index, geo_faceted_documents_ids); + + // both are unparseable, we expect GeoError::BadLatitudeAndLongitude + let err1 = index + .add_documents( + documents!({ "id" : 6, "_geo": {"lat": "unparseable", "lng": "unparseable"}}), + ) + .unwrap_err(); + assert!(matches!( + err1, + Error::UserError(UserError::InvalidGeoField(GeoError::BadLatitudeAndLongitude { .. })) + )); + + db_snap!(index, geo_faceted_documents_ids); // ensure that no more document was inserted + } + + #[test] + fn unexpected_extra_fields_in_geo_field() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_string()); + settings.set_filterable_fields(HashSet::from(["_geo".to_string()])); + }) + .unwrap(); + + let err = index + .add_documents( + documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best" }}), + ) + .unwrap_err(); + insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"doggo":"are the best"}`."###); + + db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted + + // multiple fields and complex values + let err = index + .add_documents( + documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best", "and": { "all": ["cats", { "are": "beautiful" } ] } } }), + ) + .unwrap_err(); + insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"and":{"all":["cats",{"are":"beautiful"}]},"doggo":"are the best"}`."###); + + db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted + } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 0f5b340bf..182f9fbea 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -183,14 +183,14 @@ impl<'t> Criterion for Proximity<'t> { } fn resolve_candidates( - ctx: &'_ dyn Context, + ctx: &dyn Context, query_tree: &Operation, proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, ) -> Result { fn resolve_operation( - ctx: &'_ dyn Context, + ctx: &dyn Context, query_tree: &Operation, proximity: u8, cache: &mut Cache, @@ -244,7 +244,7 @@ fn resolve_candidates( } fn mdfs_pair( - ctx: &'_ dyn Context, + ctx: &dyn Context, left: &Operation, right: &Operation, proximity: u8, @@ -299,7 +299,7 @@ fn resolve_candidates( } fn mdfs( - ctx: &'_ dyn Context, + ctx: &dyn Context, branches: &[Operation], proximity: u8, cache: &mut Cache, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 95e722074..69a210e7b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -240,14 +240,14 @@ fn alterate_query_tree( } fn resolve_candidates( - ctx: &'_ dyn Context, + ctx: &dyn Context, query_tree: &Operation, number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, ) -> Result { fn resolve_operation( - ctx: &'_ dyn Context, + ctx: &dyn Context, query_tree: &Operation, number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, @@ -277,7 +277,7 @@ fn resolve_candidates( } fn mdfs( - ctx: &'_ dyn Context, + ctx: &dyn Context, branches: &[Operation], mana: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 43367abbb..4d5028ce0 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -291,6 +291,7 @@ impl<'a> FacetDistribution<'a> { if !invalid_fields.is_empty() { return Err(UserError::InvalidFacetsDistribution { invalid_facets_name: invalid_fields.into_iter().cloned().collect(), + valid_facets_name: filterable_fields.into_iter().collect(), } .into()); } else { diff --git a/milli/src/snapshots/index.rs/bug_3007/geo_faceted_documents_ids.snap b/milli/src/snapshots/index.rs/bug_3007/geo_faceted_documents_ids.snap new file mode 100644 index 000000000..f9ebc0c20 --- /dev/null +++ b/milli/src/snapshots/index.rs/bug_3007/geo_faceted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/index.rs +--- +[0, ] diff --git a/milli/src/snapshots/index.rs/unexpected_extra_fields_in_geo_field/geo_faceted_documents_ids.snap b/milli/src/snapshots/index.rs/unexpected_extra_fields_in_geo_field/geo_faceted_documents_ids.snap new file mode 100644 index 000000000..89fb1856a --- /dev/null +++ b/milli/src/snapshots/index.rs/unexpected_extra_fields_in_geo_field/geo_faceted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/index.rs +--- +[] diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 4ba7eb08f..daf186683 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -575,8 +575,8 @@ fn remove_from_word_docids( } fn remove_docids_from_field_id_docid_facet_value( - index: &'_ Index, - wtxn: &'_ mut heed::RwTxn, + index: &Index, + wtxn: &mut heed::RwTxn, facet_type: FacetType, field_id: FieldId, to_remove: &RoaringBitmap, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index d5f09c783..aaef93b48 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -159,7 +159,7 @@ impl FacetsUpdateIncrementalInner { /// See documentation of `insert_in_level` fn insert_in_level_0( &self, - txn: &'_ mut RwTxn, + txn: &mut RwTxn, field_id: u16, facet_value: &[u8], docids: &RoaringBitmap, @@ -213,7 +213,7 @@ impl FacetsUpdateIncrementalInner { /// of the parent node should be incremented. fn insert_in_level( &self, - txn: &'_ mut RwTxn, + txn: &mut RwTxn, field_id: u16, level: u8, facet_value: &[u8], @@ -350,7 +350,7 @@ impl FacetsUpdateIncrementalInner { /// Insert the given facet value and corresponding document ids in the database. pub fn insert( &self, - txn: &'_ mut RwTxn, + txn: &mut RwTxn, field_id: u16, facet_value: &[u8], docids: &RoaringBitmap, @@ -472,7 +472,7 @@ impl FacetsUpdateIncrementalInner { /// its left bound as well. fn delete_in_level( &self, - txn: &'_ mut RwTxn, + txn: &mut RwTxn, field_id: u16, level: u8, facet_value: &[u8], @@ -531,7 +531,7 @@ impl FacetsUpdateIncrementalInner { fn delete_in_level_0( &self, - txn: &'_ mut RwTxn, + txn: &mut RwTxn, field_id: u16, facet_value: &[u8], docids: &RoaringBitmap, @@ -559,7 +559,7 @@ impl FacetsUpdateIncrementalInner { pub fn delete( &self, - txn: &'_ mut RwTxn, + txn: &mut RwTxn, field_id: u16, facet_value: &[u8], docids: &RoaringBitmap, diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 3331497c9..ed04e9962 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -98,7 +98,12 @@ pub fn enrich_documents_batch( // If the settings specifies that a _geo field must be used therefore we must check the // validity of it in all the documents of this batch and this is when we return `Some`. let geo_field_id = match documents_batch_index.id("_geo") { - Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id), + Some(geo_field_id) + if index.sortable_fields(rtxn)?.contains("_geo") + || index.filterable_fields(rtxn)?.contains("_geo") => + { + Some(geo_field_id) + } _otherwise => None, }; @@ -367,11 +372,17 @@ pub fn extract_finite_float_from_value(value: Value) -> StdResult { pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result> { use GeoError::*; - let debug_id = || Value::from(id.debug()); + let debug_id = || { + serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug())) + }; match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { + (Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields { + document_id: debug_id(), + value: object.into(), + })), (Ok(_), Ok(_)) => Ok(Ok(())), (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index be97defbd..3e9edf3a2 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -965,34 +965,6 @@ mod tests { .unwrap(); } - #[test] - fn index_all_flavour_of_geo() { - let mut index = TempIndex::new(); - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - - index - .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S("_geo"))); - }) - .unwrap(); - - index - .add_documents(documents!([ - { "id": 0, "_geo": { "lat": 31, "lng": [42] } }, - { "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 }, - { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, - { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, - ])) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let mut search = crate::Search::new(&rtxn, &index); - search.filter(crate::Filter::from_str("_geoRadius(31, 42, 0.000001)").unwrap().unwrap()); - let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); - assert_eq!(documents_ids, vec![0, 1, 2, 3]); - } - #[test] fn geo_error() { let mut index = TempIndex::new(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 55f6c66fe..1646ab5b2 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -37,9 +37,6 @@ where _ => T::deserialize_from_value(value, location).map(Setting::Set), } } - fn default() -> Option { - Some(Self::NotSet) - } } impl Default for Setting { diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 5dbc9f89b..6f12dde38 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -140,16 +140,20 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { // We remove all the entries that are no more required in this word prefix position // docids database. - let mut iter = - self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); - while let Some(((prefix, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - unsafe { iter.del_current()? }; + // We also avoid iterating over the whole `word_prefix_position_docids` database if we know in + // advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below + // will always be false (i.e. if `del_prefix_fst_words` is empty). + if !del_prefix_fst_words.is_empty() { + let mut iter = + self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + while let Some(((prefix, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + unsafe { iter.del_current()? }; + } } + drop(iter); } - drop(iter); - // We finally write all the word prefix position docids into the LMDB database. sorter_into_lmdb_database( self.wtxn,