Merge remote-tracking branch 'milli/main' into bring-v1-changes

commit fbec48f56e
Kerollmops, 2023-02-06 16:48:10 +01:00
GPG Key ID: 92ADA4E935E71FA4 (no known key found for this signature in the database)
15 changed files with 171 additions and 88 deletions

Cargo.lock (generated): 31 changed lines

@@ -1108,40 +1108,17 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "deserr"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86290491a2b5c21a1a5083da8dae831006761258fabd5617309c3eebc5f89468"
-dependencies = [
- "deserr-internal 0.1.4",
- "serde-cs",
- "serde_json",
-]
-
 [[package]]
 name = "deserr"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "28380303ca15ec07e1d5b079baf19cf849b09edad5cab219c1c51b2bd07523de"
 dependencies = [
- "deserr-internal 0.3.0",
+ "deserr-internal",
  "serde-cs",
  "serde_json",
 ]
 
-[[package]]
-name = "deserr-internal"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7131de1c27581bc376a22166c9f570be91b76cb096be2f6aecf224c27bf7c49a"
-dependencies = [
- "convert_case 0.5.0",
- "proc-macro2",
- "quote",
- "syn",
-]
-
 [[package]]
 name = "deserr-internal"
 version = "0.3.0"
@@ -2477,7 +2454,7 @@ dependencies = [
  "cargo_toml",
  "clap 4.0.32",
  "crossbeam-channel",
- "deserr 0.3.0",
+ "deserr",
  "dump",
  "either",
  "env_logger",
@@ -2569,7 +2546,7 @@ dependencies = [
  "anyhow",
  "convert_case 0.6.0",
  "csv",
- "deserr 0.3.0",
+ "deserr",
  "either",
  "enum-iterator",
  "file-store",
@@ -2628,7 +2605,7 @@ dependencies = [
  "concat-arrays",
  "crossbeam-channel",
  "csv",
- "deserr 0.1.4",
+ "deserr",
  "either",
  "filter-parser",
  "flatten-serde-json",


@@ -12,7 +12,7 @@ byteorder = "1.4.3"
 charabia = { version = "0.7.0", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
-deserr = "0.1.4"
+deserr = "0.3.0"
 either = "1.8.0"
 flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"


@@ -1,5 +1,6 @@
 use std::collections::BTreeSet;
 use std::convert::Infallible;
+use std::fmt::Write;
 use std::{io, str};
 
 use heed::{Error as HeedError, MdbError};
@@ -100,10 +101,11 @@ A document identifier can be of type integer or string, \
 only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", .document_id.to_string()
     )]
     InvalidDocumentId { document_id: Value },
-    #[error("Invalid facet distribution, the fields `{}` are not set as filterable.",
-        .invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
-    )]
-    InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
+    #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))]
+    InvalidFacetsDistribution {
+        invalid_facets_name: BTreeSet<String>,
+        valid_facets_name: BTreeSet<String>,
+    },
     #[error(transparent)]
     InvalidGeoField(#[from] GeoError),
     #[error("{0}")]
#[error("{0}")]
@@ -152,6 +154,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
 pub enum GeoError {
     #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]
     NotAnObject { document_id: Value, value: Value },
+    #[error("The `_geo` field in the document with the id: `{document_id}` contains the following unexpected fields: `{value}`.")]
+    UnexpectedExtraFields { document_id: Value, value: Value },
     #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")]
     MissingLatitudeAndLongitude { document_id: Value },
     #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")]
@@ -166,6 +170,50 @@ pub enum GeoError {
     BadLongitude { document_id: Value, value: Value },
 }
 
+fn format_invalid_filter_distribution(
+    invalid_facets_name: &BTreeSet<String>,
+    valid_facets_name: &BTreeSet<String>,
+) -> String {
+    if valid_facets_name.is_empty() {
+        return "this index does not have configured filterable attributes.".into();
+    }
+
+    let mut result = String::new();
+
+    match invalid_facets_name.len() {
+        0 => (),
+        1 => write!(
+            result,
+            "attribute `{}` is not filterable.",
+            invalid_facets_name.first().unwrap()
+        )
+        .unwrap(),
+        _ => write!(
+            result,
+            "attributes `{}` are not filterable.",
+            invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
+        )
+        .unwrap(),
+    };
+
+    match valid_facets_name.len() {
+        1 => write!(
+            result,
+            " The available filterable attribute is `{}`.",
+            valid_facets_name.first().unwrap()
+        )
+        .unwrap(),
+        _ => write!(
+            result,
+            " The available filterable attributes are `{}`.",
+            valid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
+        )
+        .unwrap(),
+    }
+
+    result
+}
+
 /// A little macro helper to autogenerate From implementation that needs two `Into`.
 /// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)`
 /// the macro will create the following code:

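For illustration, the messages produced by `format_invalid_filter_distribution` are easiest to see with concrete inputs. The following is a minimal hypothetical check, not part of the commit; it assumes the helper above is in scope (the function is private to milli, so this is only a sketch):

    use std::collections::BTreeSet;

    fn main() {
        let invalid = BTreeSet::from(["doggo".to_string()]);
        let valid = BTreeSet::from(["_geo".to_string(), "title".to_string()]);

        // One invalid attribute, two configured filterable attributes.
        assert_eq!(
            format_invalid_filter_distribution(&invalid, &valid),
            "attribute `doggo` is not filterable. The available filterable attributes are `_geo, title`."
        );

        // No filterable attributes configured at all.
        assert_eq!(
            format_invalid_filter_distribution(&invalid, &BTreeSet::new()),
            "this index does not have configured filterable attributes."
        );
    }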

@@ -426,7 +426,7 @@ impl Index {
     }
 
     /// Returns the `rtree` which associates coordinates to documents ids.
-    pub fn geo_rtree(&self, rtxn: &'_ RoTxn) -> Result<Option<RTree<GeoPoint>>> {
+    pub fn geo_rtree(&self, rtxn: &RoTxn) -> Result<Option<RTree<GeoPoint>>> {
         match self
             .main
             .get::<_, Str, SerdeBincode<RTree<GeoPoint>>>(rtxn, main_key::GEO_RTREE_KEY)?
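
A recurring cleanup in this commit: `&'_ RoTxn` and `&RoTxn` name the same type, since `'_` is just the explicitly anonymous lifetime, so the placeholder is dropped here and in the criteria, deletion, and facet-update hunks below. A standalone illustration (not milli code):

    struct Txn;

    // Both signatures elide to the same lifetime; the commit merely drops the redundant `'_`.
    fn with_placeholder(txn: &'_ Txn) -> &'_ Txn { txn }
    fn elided(txn: &Txn) -> &Txn { txn }

    fn main() {
        let txn = Txn;
        assert!(std::ptr::eq(with_placeholder(&txn), elided(&txn)));
    }
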
@@ -2292,4 +2292,69 @@ pub(crate) mod tests {
             assert!(all_ids.insert(id));
         }
     }
+
+    #[test]
+    fn bug_3007() {
+        // https://github.com/meilisearch/meilisearch/issues/3007
+        use crate::error::{GeoError, UserError};
+        let index = TempIndex::new();
+
+        // Given is an index with a geo field NOT contained in the sortable_fields of the settings
+        index
+            .update_settings(|settings| {
+                settings.set_primary_key("id".to_string());
+                settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
+            })
+            .unwrap();
+
+        // happy path
+        index.add_documents(documents!({ "id" : 5, "_geo": {"lat": 12.0, "lng": 11.0}})).unwrap();
+        db_snap!(index, geo_faceted_documents_ids);
+
+        // both are unparseable, we expect GeoError::BadLatitudeAndLongitude
+        let err1 = index
+            .add_documents(
+                documents!({ "id" : 6, "_geo": {"lat": "unparseable", "lng": "unparseable"}}),
+            )
+            .unwrap_err();
+        assert!(matches!(
+            err1,
+            Error::UserError(UserError::InvalidGeoField(GeoError::BadLatitudeAndLongitude { .. }))
+        ));
+
+        db_snap!(index, geo_faceted_documents_ids); // ensure that no more document was inserted
+    }
+
+    #[test]
+    fn unexpected_extra_fields_in_geo_field() {
+        let index = TempIndex::new();
+
+        index
+            .update_settings(|settings| {
+                settings.set_primary_key("id".to_string());
+                settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
+            })
+            .unwrap();
+
+        let err = index
+            .add_documents(
+                documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best" }}),
+            )
+            .unwrap_err();
+        insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"doggo":"are the best"}`."###);
+        db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
+
+        // multiple fields and complex values
+        let err = index
+            .add_documents(
+                documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best", "and": { "all": ["cats", { "are": "beautiful" } ] } } }),
+            )
+            .unwrap_err();
+        insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"and":{"all":["cats",{"are":"beautiful"}]},"doggo":"are the best"}`."###);
+        db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
+    }
 }

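The assertions above use insta inline snapshots: with `@r###"..."###`, the expected `Display` output is stored in the test source itself, and `cargo insta review` rewrites it when the actual output changes. A tiny self-contained example of the pattern, unrelated to milli:

    #[test]
    fn inline_snapshot_demo() {
        // The expected rendering lives inline after `@`; insta updates it on demand.
        insta::assert_display_snapshot!(21 * 2, @"42");
    }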

@@ -183,14 +183,14 @@ impl<'t> Criterion for Proximity<'t> {
 }
 
 fn resolve_candidates(
-    ctx: &'_ dyn Context,
+    ctx: &dyn Context,
     query_tree: &Operation,
     proximity: u8,
     cache: &mut Cache,
     wdcache: &mut WordDerivationsCache,
 ) -> Result<RoaringBitmap> {
     fn resolve_operation(
-        ctx: &'_ dyn Context,
+        ctx: &dyn Context,
         query_tree: &Operation,
         proximity: u8,
         cache: &mut Cache,
@@ -244,7 +244,7 @@ fn resolve_candidates(
     }
 
     fn mdfs_pair(
-        ctx: &'_ dyn Context,
+        ctx: &dyn Context,
         left: &Operation,
         right: &Operation,
         proximity: u8,
@@ -299,7 +299,7 @@ fn resolve_candidates(
     }
 
     fn mdfs(
-        ctx: &'_ dyn Context,
+        ctx: &dyn Context,
         branches: &[Operation],
         proximity: u8,
         cache: &mut Cache,


@@ -240,14 +240,14 @@ fn alterate_query_tree(
 }
 
 fn resolve_candidates(
-    ctx: &'_ dyn Context,
+    ctx: &dyn Context,
     query_tree: &Operation,
    number_typos: u8,
     cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
     wdcache: &mut WordDerivationsCache,
 ) -> Result<RoaringBitmap> {
     fn resolve_operation(
-        ctx: &'_ dyn Context,
+        ctx: &dyn Context,
         query_tree: &Operation,
         number_typos: u8,
         cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
@@ -277,7 +277,7 @@ fn resolve_candidates(
     }
 
     fn mdfs(
-        ctx: &'_ dyn Context,
+        ctx: &dyn Context,
         branches: &[Operation],
         mana: u8,
         cache: &mut HashMap<(Operation, u8), RoaringBitmap>,


@@ -291,6 +291,7 @@ impl<'a> FacetDistribution<'a> {
         if !invalid_fields.is_empty() {
             return Err(UserError::InvalidFacetsDistribution {
                 invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
+                valid_facets_name: filterable_fields.into_iter().collect(),
             }
             .into());
         } else {


@@ -0,0 +1,4 @@
+---
+source: milli/src/index.rs
+---
+[0, ]


@@ -0,0 +1,4 @@
+---
+source: milli/src/index.rs
+---
+[]


@@ -575,8 +575,8 @@ fn remove_from_word_docids(
 }
 
 fn remove_docids_from_field_id_docid_facet_value(
-    index: &'_ Index,
-    wtxn: &'_ mut heed::RwTxn,
+    index: &Index,
+    wtxn: &mut heed::RwTxn,
     facet_type: FacetType,
     field_id: FieldId,
     to_remove: &RoaringBitmap,


@@ -159,7 +159,7 @@ impl FacetsUpdateIncrementalInner {
     /// See documentation of `insert_in_level`
     fn insert_in_level_0(
         &self,
-        txn: &'_ mut RwTxn,
+        txn: &mut RwTxn,
         field_id: u16,
         facet_value: &[u8],
         docids: &RoaringBitmap,
@@ -213,7 +213,7 @@ impl FacetsUpdateIncrementalInner {
     /// of the parent node should be incremented.
     fn insert_in_level(
         &self,
-        txn: &'_ mut RwTxn,
+        txn: &mut RwTxn,
         field_id: u16,
         level: u8,
         facet_value: &[u8],
@@ -350,7 +350,7 @@ impl FacetsUpdateIncrementalInner {
     /// Insert the given facet value and corresponding document ids in the database.
     pub fn insert(
         &self,
-        txn: &'_ mut RwTxn,
+        txn: &mut RwTxn,
         field_id: u16,
         facet_value: &[u8],
         docids: &RoaringBitmap,
@@ -472,7 +472,7 @@ impl FacetsUpdateIncrementalInner {
     /// its left bound as well.
     fn delete_in_level(
         &self,
-        txn: &'_ mut RwTxn,
+        txn: &mut RwTxn,
         field_id: u16,
         level: u8,
         facet_value: &[u8],
@@ -531,7 +531,7 @@ impl FacetsUpdateIncrementalInner {
 
     fn delete_in_level_0(
         &self,
-        txn: &'_ mut RwTxn,
+        txn: &mut RwTxn,
         field_id: u16,
         facet_value: &[u8],
         docids: &RoaringBitmap,
@@ -559,7 +559,7 @@ impl FacetsUpdateIncrementalInner {
 
     pub fn delete(
         &self,
-        txn: &'_ mut RwTxn,
+        txn: &mut RwTxn,
         field_id: u16,
         facet_value: &[u8],
         docids: &RoaringBitmap,


@@ -98,7 +98,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
     // If the settings specifies that a _geo field must be used therefore we must check the
     // validity of it in all the documents of this batch and this is when we return `Some`.
     let geo_field_id = match documents_batch_index.id("_geo") {
-        Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
+        Some(geo_field_id)
+            if index.sortable_fields(rtxn)?.contains("_geo")
+                || index.filterable_fields(rtxn)?.contains("_geo") =>
+        {
+            Some(geo_field_id)
+        }
         _otherwise => None,
     };
@@ -367,11 +372,17 @@ pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
 
 pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
     use GeoError::*;
-    let debug_id = || Value::from(id.debug());
+    let debug_id = || {
+        serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug()))
+    };
     match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
         Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
             (Some(lat), Some(lng)) => {
                 match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
+                    (Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields {
+                        document_id: debug_id(),
+                        value: object.into(),
+                    })),
                     (Ok(_), Ok(_)) => Ok(Ok(())),
                     (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
                     (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),

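The new `UnexpectedExtraFields` arm rejects a `_geo` object as soon as both coordinates parse but other keys remain after `lat` and `lng` are removed. A stripped-down, hypothetical sketch of just that rule (simplified types, none of milli's error plumbing or coordinate parsing):

    use serde_json::{Map, Value};

    #[derive(Debug, PartialEq)]
    enum GeoCheck {
        Valid,
        UnexpectedExtraFields(Map<String, Value>),
        MissingLatOrLng,
    }

    fn check_geo(object: &Map<String, Value>) -> GeoCheck {
        let mut rest = object.clone();
        match (rest.remove("lat"), rest.remove("lng")) {
            // Both coordinates present: any leftover key is now an error.
            (Some(_), Some(_)) if !rest.is_empty() => GeoCheck::UnexpectedExtraFields(rest),
            (Some(_), Some(_)) => GeoCheck::Valid,
            _ => GeoCheck::MissingLatOrLng,
        }
    }

    fn main() {
        let ok: Map<String, Value> = serde_json::from_str(r#"{ "lat": 1, "lng": 2 }"#).unwrap();
        let extra: Map<String, Value> =
            serde_json::from_str(r#"{ "lat": 1, "lng": 2, "doggo": "are the best" }"#).unwrap();
        assert_eq!(check_geo(&ok), GeoCheck::Valid);
        assert!(matches!(check_geo(&extra), GeoCheck::UnexpectedExtraFields(_)));
    }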

@@ -965,34 +965,6 @@ mod tests {
             .unwrap();
     }
 
-    #[test]
-    fn index_all_flavour_of_geo() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-
-        index
-            .update_settings(|settings| {
-                settings.set_filterable_fields(hashset!(S("_geo")));
-            })
-            .unwrap();
-
-        index
-            .add_documents(documents!([
-                { "id": 0, "_geo": { "lat": 31, "lng": [42] } },
-                { "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 },
-                { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" },
-                { "id": 3, "_geo.lat": 31, "_geo.lng": "42" },
-            ]))
-            .unwrap();
-
-        let rtxn = index.read_txn().unwrap();
-        let mut search = crate::Search::new(&rtxn, &index);
-        search.filter(crate::Filter::from_str("_geoRadius(31, 42, 0.000001)").unwrap().unwrap());
-        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
-        assert_eq!(documents_ids, vec![0, 1, 2, 3]);
-    }
-
     #[test]
     fn geo_error() {
         let mut index = TempIndex::new();


@@ -37,9 +37,6 @@ where
             _ => T::deserialize_from_value(value, location).map(Setting::Set),
         }
     }
-    fn default() -> Option<Self> {
-        Some(Self::NotSet)
-    }
 }
 
 impl<T> Default for Setting<T> {


@@ -140,16 +140,20 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
 
         // We remove all the entries that are no more required in this word prefix position
         // docids database.
-        let mut iter =
-            self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
-        while let Some(((prefix, _), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(prefix.as_bytes()) {
-                unsafe { iter.del_current()? };
+        // We also avoid iterating over the whole `word_prefix_position_docids` database if we know in
+        // advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below
+        // will always be false (i.e. if `del_prefix_fst_words` is empty).
+        if !del_prefix_fst_words.is_empty() {
+            let mut iter =
+                self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
+            while let Some(((prefix, _), _)) = iter.next().transpose()? {
+                if del_prefix_fst_words.contains(prefix.as_bytes()) {
+                    unsafe { iter.del_current()? };
+                }
             }
+            drop(iter);
         }
-        drop(iter);
 
         // We finally write all the word prefix position docids into the LMDB database.
         sorter_into_lmdb_database(
             self.wtxn,