Merge remote-tracking branch 'milli/main' into bring-v1-changes

commit fbec48f56e by Kerollmops, 2023-02-06 16:48:10 +01:00
GPG Key ID: 92ADA4E935E71FA4 (no known key found for this signature in database)
15 changed files with 171 additions and 88 deletions

Cargo.lock (generated)

@@ -1108,40 +1108,17 @@ dependencies = [
"syn",
]
-[[package]]
-name = "deserr"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86290491a2b5c21a1a5083da8dae831006761258fabd5617309c3eebc5f89468"
-dependencies = [
-"deserr-internal 0.1.4",
-"serde-cs",
-"serde_json",
-]
[[package]]
name = "deserr"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28380303ca15ec07e1d5b079baf19cf849b09edad5cab219c1c51b2bd07523de"
dependencies = [
"deserr-internal 0.3.0",
"deserr-internal",
"serde-cs",
"serde_json",
]
-[[package]]
-name = "deserr-internal"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7131de1c27581bc376a22166c9f570be91b76cb096be2f6aecf224c27bf7c49a"
-dependencies = [
-"convert_case 0.5.0",
-"proc-macro2",
-"quote",
-"syn",
-]
[[package]]
name = "deserr-internal"
version = "0.3.0"
@@ -2477,7 +2454,7 @@ dependencies = [
"cargo_toml",
"clap 4.0.32",
"crossbeam-channel",
"deserr 0.3.0",
"deserr",
"dump",
"either",
"env_logger",
@@ -2569,7 +2546,7 @@ dependencies = [
"anyhow",
"convert_case 0.6.0",
"csv",
"deserr 0.3.0",
"deserr",
"either",
"enum-iterator",
"file-store",
@@ -2628,7 +2605,7 @@ dependencies = [
"concat-arrays",
"crossbeam-channel",
"csv",
"deserr 0.1.4",
"deserr",
"either",
"filter-parser",
"flatten-serde-json",


@@ -12,7 +12,7 @@ byteorder = "1.4.3"
charabia = { version = "0.7.0", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.6"
deserr = "0.1.4"
deserr = "0.3.0"
either = "1.8.0"
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"


@@ -1,5 +1,6 @@
use std::collections::BTreeSet;
use std::convert::Infallible;
+use std::fmt::Write;
use std::{io, str};
use heed::{Error as HeedError, MdbError};
@@ -100,10 +101,11 @@ A document identifier can be of type integer or string, \
only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", .document_id.to_string()
)]
InvalidDocumentId { document_id: Value },
#[error("Invalid facet distribution, the fields `{}` are not set as filterable.",
.invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)]
InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
#[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))]
InvalidFacetsDistribution {
invalid_facets_name: BTreeSet<String>,
valid_facets_name: BTreeSet<String>,
},
#[error(transparent)]
InvalidGeoField(#[from] GeoError),
#[error("{0}")]
@@ -152,6 +154,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
pub enum GeoError {
#[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]
NotAnObject { document_id: Value, value: Value },
#[error("The `_geo` field in the document with the id: `{document_id}` contains the following unexpected fields: `{value}`.")]
UnexpectedExtraFields { document_id: Value, value: Value },
#[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")]
MissingLatitudeAndLongitude { document_id: Value },
#[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")]
@@ -166,6 +170,50 @@ pub enum GeoError {
BadLongitude { document_id: Value, value: Value },
}
+fn format_invalid_filter_distribution(
+invalid_facets_name: &BTreeSet<String>,
+valid_facets_name: &BTreeSet<String>,
+) -> String {
+if valid_facets_name.is_empty() {
+return "this index does not have configured filterable attributes.".into();
+}
+let mut result = String::new();
+match invalid_facets_name.len() {
+0 => (),
+1 => write!(
+result,
+"attribute `{}` is not filterable.",
+invalid_facets_name.first().unwrap()
+)
+.unwrap(),
+_ => write!(
+result,
+"attributes `{}` are not filterable.",
+invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
+)
+.unwrap(),
+};
+match valid_facets_name.len() {
+1 => write!(
+result,
+" The available filterable attribute is `{}`.",
+valid_facets_name.first().unwrap()
+)
+.unwrap(),
+_ => write!(
+result,
+" The available filterable attributes are `{}`.",
+valid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
+)
+.unwrap(),
+}
+result
+}
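
(Illustration, not part of the commit: a minimal sketch of the messages the helper above produces, assuming it is exercised from within the same module; the test name is made up.)

#[test]
fn invalid_filter_distribution_message_shapes() {
    use std::collections::BTreeSet;
    // With no configured filterable attribute, the early return wins.
    assert_eq!(
        format_invalid_filter_distribution(&BTreeSet::from(["title".into()]), &BTreeSet::new()),
        "this index does not have configured filterable attributes."
    );
    // One invalid attribute, two available ones.
    assert_eq!(
        format_invalid_filter_distribution(
            &BTreeSet::from(["title".into()]),
            &BTreeSet::from(["genre".into(), "price".into()]),
        ),
        "attribute `title` is not filterable. The available filterable attributes are `genre, price`."
    );
}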
/// A little macro helper to autogenerate a `From` implementation that needs two `Into` conversions.
/// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)`
/// the macro will create the following code:
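
(The generated code itself falls outside the hunk shown above; as a sketch, assuming the chained-`Into` pattern the comment describes, the expansion for `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)` would look like:)

// The sub-error is first converted into the intermediate `InternalError`,
// which is in turn converted into the top-level `Error`.
impl From<FieldIdMapMissingEntry> for Error {
    fn from(error: FieldIdMapMissingEntry) -> Error {
        Error::from(InternalError::from(error))
    }
}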


@@ -426,7 +426,7 @@ impl Index {
}
/// Returns the `rtree` which associates coordinates to documents ids.
-pub fn geo_rtree(&self, rtxn: &'_ RoTxn) -> Result<Option<RTree<GeoPoint>>> {
+pub fn geo_rtree(&self, rtxn: &RoTxn) -> Result<Option<RTree<GeoPoint>>> {
match self
.main
.get::<_, Str, SerdeBincode<RTree<GeoPoint>>>(rtxn, main_key::GEO_RTREE_KEY)?
@@ -2292,4 +2292,69 @@ pub(crate) mod tests {
assert!(all_ids.insert(id));
}
}
+#[test]
+fn bug_3007() {
+// https://github.com/meilisearch/meilisearch/issues/3007
+use crate::error::{GeoError, UserError};
+let index = TempIndex::new();
+// Given an index with a geo field NOT contained in the sortable_fields of the settings
+index
+.update_settings(|settings| {
+settings.set_primary_key("id".to_string());
+settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
+})
+.unwrap();
+// happy path
+index.add_documents(documents!({ "id" : 5, "_geo": {"lat": 12.0, "lng": 11.0}})).unwrap();
+db_snap!(index, geo_faceted_documents_ids);
+// both are unparseable, so we expect GeoError::BadLatitudeAndLongitude
+let err1 = index
+.add_documents(
+documents!({ "id" : 6, "_geo": {"lat": "unparseable", "lng": "unparseable"}}),
+)
+.unwrap_err();
+assert!(matches!(
+err1,
+Error::UserError(UserError::InvalidGeoField(GeoError::BadLatitudeAndLongitude { .. }))
+));
+db_snap!(index, geo_faceted_documents_ids); // ensure that no more documents were inserted
+}
+#[test]
+fn unexpected_extra_fields_in_geo_field() {
+let index = TempIndex::new();
+index
+.update_settings(|settings| {
+settings.set_primary_key("id".to_string());
+settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
+})
+.unwrap();
+let err = index
+.add_documents(
+documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best" }}),
+)
+.unwrap_err();
+insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"doggo":"are the best"}`."###);
+db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
+// multiple fields and complex values
+let err = index
+.add_documents(
+documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best", "and": { "all": ["cats", { "are": "beautiful" } ] } } }),
+)
+.unwrap_err();
+insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"and":{"all":["cats",{"are":"beautiful"}]},"doggo":"are the best"}`."###);
+db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
+}
}


@@ -183,14 +183,14 @@ impl<'t> Criterion for Proximity<'t> {
}
fn resolve_candidates(
-ctx: &'_ dyn Context,
+ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation(
-ctx: &'_ dyn Context,
+ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
@@ -244,7 +244,7 @@ fn resolve_candidates(
}
fn mdfs_pair(
-ctx: &'_ dyn Context,
+ctx: &dyn Context,
left: &Operation,
right: &Operation,
proximity: u8,
@@ -299,7 +299,7 @@ fn resolve_candidates(
}
fn mdfs(
-ctx: &'_ dyn Context,
+ctx: &dyn Context,
branches: &[Operation],
proximity: u8,
cache: &mut Cache,


@@ -240,14 +240,14 @@ fn alterate_query_tree(
}
fn resolve_candidates(
-ctx: &'_ dyn Context,
+ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation(
-ctx: &'_ dyn Context,
+ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
@@ -277,7 +277,7 @@ fn resolve_candidates(
}
fn mdfs(
-ctx: &'_ dyn Context,
+ctx: &dyn Context,
branches: &[Operation],
mana: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,


@@ -291,6 +291,7 @@ impl<'a> FacetDistribution<'a> {
if !invalid_fields.is_empty() {
return Err(UserError::InvalidFacetsDistribution {
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
+valid_facets_name: filterable_fields.into_iter().collect(),
}
.into());
} else {


@@ -0,0 +1,4 @@
+---
+source: milli/src/index.rs
+---
+[0, ]


@@ -0,0 +1,4 @@
+---
+source: milli/src/index.rs
+---
+[]


@@ -575,8 +575,8 @@ fn remove_from_word_docids(
}
fn remove_docids_from_field_id_docid_facet_value(
-index: &'_ Index,
-wtxn: &'_ mut heed::RwTxn,
+index: &Index,
+wtxn: &mut heed::RwTxn,
facet_type: FacetType,
field_id: FieldId,
to_remove: &RoaringBitmap,


@@ -159,7 +159,7 @@ impl FacetsUpdateIncrementalInner {
/// See documentation of `insert_in_level`
fn insert_in_level_0(
&self,
-txn: &'_ mut RwTxn,
+txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@@ -213,7 +213,7 @@ impl FacetsUpdateIncrementalInner {
/// of the parent node should be incremented.
fn insert_in_level(
&self,
-txn: &'_ mut RwTxn,
+txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
@@ -350,7 +350,7 @@ impl FacetsUpdateIncrementalInner {
/// Insert the given facet value and corresponding document ids in the database.
pub fn insert(
&self,
-txn: &'_ mut RwTxn,
+txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@@ -472,7 +472,7 @@ impl FacetsUpdateIncrementalInner {
/// its left bound as well.
fn delete_in_level(
&self,
-txn: &'_ mut RwTxn,
+txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
@@ -531,7 +531,7 @@ impl FacetsUpdateIncrementalInner {
fn delete_in_level_0(
&self,
-txn: &'_ mut RwTxn,
+txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@@ -559,7 +559,7 @@ impl FacetsUpdateIncrementalInner {
pub fn delete(
&self,
-txn: &'_ mut RwTxn,
+txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,


@@ -98,7 +98,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
// If the settings specify that a _geo field must be used, we must check its
// validity in all the documents of this batch, and this is when we return `Some`.
let geo_field_id = match documents_batch_index.id("_geo") {
-Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
+Some(geo_field_id)
+if index.sortable_fields(rtxn)?.contains("_geo")
+|| index.filterable_fields(rtxn)?.contains("_geo") =>
+{
+Some(geo_field_id)
+}
_otherwise => None,
};
@@ -367,11 +372,17 @@ pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
use GeoError::*;
-let debug_id = || Value::from(id.debug());
+let debug_id = || {
+serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug()))
+};
match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
(Some(lat), Some(lng)) => {
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
+(Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields {
+document_id: debug_id(),
+value: object.into(),
+})),
(Ok(_), Ok(_)) => Ok(Ok(())),
(Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
(Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),
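
(Illustration, not part of the commit: how the match above classifies a few `_geo` payloads; the example values are made up.)

// { "lat": 12.0, "lng": 11.0 }                      -> Ok(Ok(()))
// { "lat": "north", "lng": 11.0 }                   -> Ok(Err(BadLatitude { .. }))
// { "lat": 12.0, "lng": 11.0, "doggo": "intruder" } -> Ok(Err(UnexpectedExtraFields { .. }))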


@@ -965,34 +965,6 @@ mod tests {
.unwrap();
}
-#[test]
-fn index_all_flavour_of_geo() {
-let mut index = TempIndex::new();
-index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-index
-.update_settings(|settings| {
-settings.set_filterable_fields(hashset!(S("_geo")));
-})
-.unwrap();
-index
-.add_documents(documents!([
-{ "id": 0, "_geo": { "lat": 31, "lng": [42] } },
-{ "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 },
-{ "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" },
-{ "id": 3, "_geo.lat": 31, "_geo.lng": "42" },
-]))
-.unwrap();
-let rtxn = index.read_txn().unwrap();
-let mut search = crate::Search::new(&rtxn, &index);
-search.filter(crate::Filter::from_str("_geoRadius(31, 42, 0.000001)").unwrap().unwrap());
-let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
-assert_eq!(documents_ids, vec![0, 1, 2, 3]);
-}
#[test]
fn geo_error() {
let mut index = TempIndex::new();


@@ -37,9 +37,6 @@ where
_ => T::deserialize_from_value(value, location).map(Setting::Set),
}
}
-fn default() -> Option<Self> {
-Some(Self::NotSet)
-}
}
impl<T> Default for Setting<T> {


@@ -140,6 +140,10 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
// We remove all the entries that are no longer required in this word prefix position
// docids database.
+// We also avoid iterating over the whole `word_prefix_position_docids` database if we know in
+// advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below
+// will always be false (i.e. if `del_prefix_fst_words` is empty).
+if !del_prefix_fst_words.is_empty() {
let mut iter =
self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
while let Some(((prefix, _), _)) = iter.next().transpose()? {
@@ -147,8 +151,8 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
unsafe { iter.del_current()? };
}
}
drop(iter);
+}
// We finally write all the word prefix position docids into the LMDB database.
sorter_into_lmdb_database(