diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs
index 735ffec0e..9abe1c6ea 100644
--- a/filter-parser/src/condition.rs
+++ b/filter-parser/src/condition.rs
@@ -20,6 +20,8 @@ pub enum Condition<'a> {
     GreaterThanOrEqual(Token<'a>),
     Equal(Token<'a>),
     NotEqual(Token<'a>),
+    Null,
+    Empty,
     Exists,
     LowerThan(Token<'a>),
     LowerThanOrEqual(Token<'a>),
@@ -44,6 +46,38 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
     Ok((input, condition))
 }
 
+/// null = value "IS" WS+ "NULL"
+pub fn parse_is_null(input: Span) -> IResult<FilterCondition> {
+    let (input, key) = parse_value(input)?;
+
+    let (input, _) = tuple((tag("IS"), multispace1, tag("NULL")))(input)?;
+    Ok((input, FilterCondition::Condition { fid: key, op: Null }))
+}
+
+/// null = value "IS" WS+ "NOT" WS+ "NULL"
+pub fn parse_is_not_null(input: Span) -> IResult<FilterCondition> {
+    let (input, key) = parse_value(input)?;
+
+    let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("NULL")))(input)?;
+    Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Null }))))
+}
+
+/// empty = value "IS" WS+ "EMPTY"
+pub fn parse_is_empty(input: Span) -> IResult<FilterCondition> {
+    let (input, key) = parse_value(input)?;
+
+    let (input, _) = tuple((tag("IS"), multispace1, tag("EMPTY")))(input)?;
+    Ok((input, FilterCondition::Condition { fid: key, op: Empty }))
+}
+
+/// empty = value "IS" WS+ "NOT" WS+ "EMPTY"
+pub fn parse_is_not_empty(input: Span) -> IResult<FilterCondition> {
+    let (input, key) = parse_value(input)?;
+
+    let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("EMPTY")))(input)?;
+    Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Empty }))))
+}
+
 /// exist = value "EXISTS"
 pub fn parse_exists(input: Span) -> IResult<FilterCondition> {
     let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?;
diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs
index 6de9526fd..b9d14a271 100644
--- a/filter-parser/src/error.rs
+++ b/filter-parser/src/error.rs
@@ -143,11 +143,9 @@ impl<'a> Display for Error<'a> {
             ErrorKind::MissingClosingDelimiter(c) => {
                 writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)?
             }
-            ErrorKind::InvalidPrimary if input.trim().is_empty() => {
-                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing.")?
-            }
             ErrorKind::InvalidPrimary => {
-                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `{}`.", escaped_input)?
+                let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
+                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
             }
             ErrorKind::ExpectedEof => {
                 writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
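The four new parsers above all follow the same shape: consume a field name with `parse_value`, require the literal keyword sequence with nom's `tuple`, and express the negative forms by wrapping the positive condition in `FilterCondition::Not` rather than adding a dedicated operator. The snippet below is only a minimal sketch of that combinator pattern outside the crate, assuming nom 7 and plain `&str` input instead of the crate's `Span`/`Token` types; the `Ast` enum and `field` helper are illustrative names, not part of the patch.

```rust
// Minimal sketch of the `IS NULL` / `IS NOT NULL` combinator shape, assuming nom 7
// and plain &str input; `Ast` and `field` are illustrative, not from the patch.
use nom::branch::alt;
use nom::bytes::complete::{tag, take_while1};
use nom::character::complete::multispace1;
use nom::sequence::{terminated, tuple};
use nom::IResult;

#[derive(Debug, PartialEq)]
enum Ast<'a> {
    IsNull(&'a str),
    // No dedicated `IS NOT NULL` node: like the patch, the negative form negates `IsNull`.
    Not(Box<Ast<'a>>),
}

// A field name followed by at least one space.
fn field(input: &str) -> IResult<&str, &str> {
    terminated(take_while1(|c: char| c.is_alphanumeric() || c == '_'), multispace1)(input)
}

/// field "IS" WS+ "NULL"
fn is_null(input: &str) -> IResult<&str, Ast> {
    let (input, key) = field(input)?;
    let (input, _) = tuple((tag("IS"), multispace1, tag("NULL")))(input)?;
    Ok((input, Ast::IsNull(key)))
}

/// field "IS" WS+ "NOT" WS+ "NULL"
fn is_not_null(input: &str) -> IResult<&str, Ast> {
    let (input, key) = field(input)?;
    let (input, _) = tuple((tag("IS"), multispace1, tag("NOT"), multispace1, tag("NULL")))(input)?;
    Ok((input, Ast::Not(Box::new(Ast::IsNull(key)))))
}

fn main() {
    // `IS NULL` fails fast on the `NOT` token, so trying it first is harmless,
    // which is why the patch can list `parse_is_null` before `parse_is_not_null`.
    let mut parser = alt((is_null, is_not_null));
    assert_eq!(parser("colour IS NULL"), Ok(("", Ast::IsNull("colour"))));
    assert_eq!(
        parser("colour IS NOT NULL"),
        Ok(("", Ast::Not(Box::new(Ast::IsNull("colour")))))
    );
}
```

The same ordering argument applies to `IS EMPTY` / `IS NOT EMPTY` in `parse_primary`, since the shorter form can never accidentally consume the `NOT` token.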
diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 588953f70..0ae8a1532 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -47,7 +47,10 @@ mod value; use std::fmt::Debug; pub use condition::{parse_condition, parse_to, Condition}; -use condition::{parse_exists, parse_not_exists}; +use condition::{ + parse_exists, parse_is_empty, parse_is_not_empty, parse_is_not_null, parse_is_null, + parse_not_exists, +}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -442,6 +445,10 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_in, parse_not_in, parse_condition, + parse_is_null, + parse_is_not_null, + parse_is_empty, + parse_is_not_empty, parse_exists, parse_not_exists, parse_to, @@ -526,14 +533,30 @@ pub mod tests { insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}"); insta::assert_display_snapshot!(p("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}"); - // Test NOT + EXISTS - insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); + // Test NOT insta::assert_display_snapshot!(p("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); + insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); + + // Test NULL + NOT NULL + insta::assert_display_snapshot!(p("subscribers IS NULL"), @"{subscribers} IS NULL"); + insta::assert_display_snapshot!(p("NOT subscribers IS NULL"), @"NOT ({subscribers} IS NULL)"); + insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)"); + insta::assert_display_snapshot!(p("NOT subscribers IS NOT NULL"), @"{subscribers} IS NULL"); + insta::assert_display_snapshot!(p("subscribers IS NOT NULL"), @"NOT ({subscribers} IS NULL)"); + + // Test EMPTY + NOT EMPTY + insta::assert_display_snapshot!(p("subscribers IS EMPTY"), @"{subscribers} IS EMPTY"); + insta::assert_display_snapshot!(p("NOT subscribers IS EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + insta::assert_display_snapshot!(p("NOT subscribers IS NOT EMPTY"), @"{subscribers} IS EMPTY"); + insta::assert_display_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + + // Test EXISTS + NOT EXITS + insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); - insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); // Test nested NOT insta::assert_display_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); @@ -606,7 +629,7 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("'OR'"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `\'OR\'`. 
1:5 'OR' "###); @@ -616,12 +639,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("channel Ponce"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `channel Ponce`. 1:14 channel Ponce "###); insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` but instead got nothing. 19:19 channel = Ponce OR "###); @@ -706,12 +729,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. 1:17 colour NOT EXIST "###); insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. 1:23 subscribers 100 TO1000 "###); @@ -772,6 +795,39 @@ pub mod tests { Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 5:7 NOT OR EXISTS AND EXISTS NOT EXISTS "###); + + insta::assert_display_snapshot!(p(r#"value NULL"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NULL`. + 1:11 value NULL + "###); + insta::assert_display_snapshot!(p(r#"value NOT NULL"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT NULL`. + 1:15 value NOT NULL + "###); + insta::assert_display_snapshot!(p(r#"value EMPTY"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value EMPTY`. + 1:12 value EMPTY + "###); + insta::assert_display_snapshot!(p(r#"value NOT EMPTY"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value NOT EMPTY`. 
+ 1:16 value NOT EMPTY + "###); + insta::assert_display_snapshot!(p(r#"value IS"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS`. + 1:9 value IS + "###); + insta::assert_display_snapshot!(p(r#"value IS NOT"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT`. + 1:13 value IS NOT + "###); + insta::assert_display_snapshot!(p(r#"value IS EXISTS"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS EXISTS`. + 1:16 value IS EXISTS + "###); + insta::assert_display_snapshot!(p(r#"value IS NOT EXISTS"#), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT EXISTS`. + 1:20 value IS NOT EXISTS + "###); } #[test] @@ -853,6 +909,8 @@ impl<'a> std::fmt::Display for Condition<'a> { Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"), Condition::Equal(token) => write!(f, "= {token}"), Condition::NotEqual(token) => write!(f, "!= {token}"), + Condition::Null => write!(f, "IS NULL"), + Condition::Empty => write!(f, "IS EMPTY"), Condition::Exists => write!(f, "EXISTS"), Condition::LowerThan(token) => write!(f, "< {token}"), Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index fb8f35503..735352fc3 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -183,7 +183,20 @@ fn is_syntax_component(c: char) -> bool { } fn is_keyword(s: &str) -> bool { - matches!(s, "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "_geoRadius" | "_geoBoundingBox") + matches!( + s, + "AND" + | "OR" + | "IN" + | "NOT" + | "TO" + | "EXISTS" + | "IS" + | "NULL" + | "EMPTY" + | "_geoRadius" + | "_geoBoundingBox" + ) } #[cfg(test)] diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index e1b2b20c7..c3346e15e 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -4,51 +4,56 @@ use serde_json::{Map, Value}; pub fn flatten(json: &Map) -> Map { let mut obj = Map::new(); - let mut all_keys = vec![]; - insert_object(&mut obj, None, json, &mut all_keys); - for key in all_keys { - obj.entry(key).or_insert(Value::Array(vec![])); + let mut all_entries = vec![]; + insert_object(&mut obj, None, json, &mut all_entries); + for (key, old_val) in all_entries { + obj.entry(key).or_insert(old_val.clone()); } obj } -fn insert_object( +fn insert_object<'a>( base_json: &mut Map, base_key: Option<&str>, - object: &Map, - all_keys: &mut Vec, + object: &'a Map, + all_entries: &mut Vec<(String, &'a Value)>, ) { for (key, value) in object { let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); - all_keys.push(new_key.clone()); + all_entries.push((new_key.clone(), value)); if let Some(array) = value.as_array() { - insert_array(base_json, &new_key, array, all_keys); + insert_array(base_json, &new_key, array, all_entries); } else if let Some(object) = value.as_object() { - insert_object(base_json, 
Some(&new_key), object, all_keys); + insert_object(base_json, Some(&new_key), object, all_entries); } else { - insert_value(base_json, &new_key, value.clone()); + insert_value(base_json, &new_key, value.clone(), false); } } } -fn insert_array( +fn insert_array<'a>( base_json: &mut Map, base_key: &str, - array: &Vec, - all_keys: &mut Vec, + array: &'a Vec, + all_entries: &mut Vec<(String, &'a Value)>, ) { for value in array { if let Some(object) = value.as_object() { - insert_object(base_json, Some(base_key), object, all_keys); + insert_object(base_json, Some(base_key), object, all_entries); } else if let Some(sub_array) = value.as_array() { - insert_array(base_json, base_key, sub_array, all_keys); + insert_array(base_json, base_key, sub_array, all_entries); } else { - insert_value(base_json, base_key, value.clone()); + insert_value(base_json, base_key, value.clone(), true); } } } -fn insert_value(base_json: &mut Map, key: &str, to_insert: Value) { +fn insert_value( + base_json: &mut Map, + key: &str, + to_insert: Value, + came_from_array: bool, +) { debug_assert!(!to_insert.is_object()); debug_assert!(!to_insert.is_array()); @@ -63,6 +68,8 @@ fn insert_value(base_json: &mut Map, key: &str, to_insert: Value) base_json[key] = Value::Array(vec![value, to_insert]); } // if it does not exist we can push the value untouched + } else if came_from_array { + base_json.insert(key.to_string(), Value::Array(vec![to_insert])); } else { base_json.insert(key.to_string(), to_insert); } @@ -113,7 +120,11 @@ mod tests { assert_eq!( &flat, json!({ - "a": [], + "a": { + "b": "c", + "d": "e", + "f": "g" + }, "a.b": "c", "a.d": "e", "a.f": "g" @@ -164,7 +175,7 @@ mod tests { assert_eq!( &flat, json!({ - "a": 42, + "a": [42], "a.b": ["c", "d", "e"], }) .as_object() @@ -186,7 +197,7 @@ mod tests { assert_eq!( &flat, json!({ - "a": null, + "a": [null], "a.b": ["c", "d", "e"], }) .as_object() @@ -208,7 +219,9 @@ mod tests { assert_eq!( &flat, json!({ - "a": [], + "a": { + "b": "c" + }, "a.b": ["c", "d"], }) .as_object() @@ -234,7 +247,7 @@ mod tests { json!({ "a.b": ["c", "d", "f"], "a.c": "e", - "a": 35, + "a": [35], }) .as_object() .unwrap() @@ -302,4 +315,53 @@ mod tests { .unwrap() ); } + + #[test] + fn flatten_nested_values_keep_original_values() { + let mut base: Value = json!({ + "tags": { + "t1": "v1" + }, + "prices": { + "p1": [null], + "p1000": {"tamo": {"le": {}}} + }, + "kiki": [[]] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!("{}", serde_json::to_string_pretty(&flat).unwrap()); + + assert_eq!( + &flat, + json!({ + "prices": { + "p1": [null], + "p1000": { + "tamo": { + "le": {} + } + } + }, + "prices.p1": [null], + "prices.p1000": { + "tamo": { + "le": {} + } + }, + "prices.p1000.tamo": { + "le": {} + }, + "prices.p1000.tamo.le": {}, + "tags": { + "t1": "v1" + }, + "tags.t1": "v1", + "kiki": [[]] + }) + .as_object() + .unwrap() + ); + } } diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 0aaf87773..a9a2969bb 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -547,7 +547,7 @@ async fn filter_invalid_syntax_object() { index.wait_task(1).await; let expected_response = json!({ - "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", + "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, 
`EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -572,7 +572,7 @@ async fn filter_invalid_syntax_array() { index.wait_task(1).await; let expected_response = json!({ - "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", + "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" diff --git a/milli/src/index.rs b/milli/src/index.rs index acb2a0a9d..d22f788c6 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -80,6 +80,8 @@ pub mod db_name { pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; + pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids"; + pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; @@ -129,6 +131,10 @@ pub struct Index { /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, + /// Maps the facet field id and the docids for which this field is set as null + pub facet_id_is_null_docids: Database, + /// Maps the facet field id and the docids for which this field is considered empty + pub facet_id_is_empty_docids: Database, /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. pub facet_id_f64_docids: Database, FacetGroupValueCodec>, @@ -153,7 +159,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(19); + options.max_dbs(21); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -175,6 +181,8 @@ impl Index { let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?; + let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = @@ -201,6 +209,8 @@ impl Index { facet_id_f64_docids, facet_id_string_docids, facet_id_exists_docids, + facet_id_is_null_docids, + facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -833,6 +843,30 @@ impl Index { } } + /// Retrieve all the documents which contain this field id set as null + pub fn null_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result { + match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? 
{ + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + + /// Retrieve all the documents which contain this field id and that is considered empty + pub fn empty_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result { + match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /// Retrieve all the documents which contain this field id pub fn exists_faceted_documents_ids( &self, diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 9a4aba9ba..19d763107 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -211,6 +211,14 @@ impl<'a> Filter<'a> { Condition::Between { from, to } => { (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?)) } + Condition::Null => { + let is_null = index.null_faceted_documents_ids(rtxn, field_id)?; + return Ok(is_null); + } + Condition::Empty => { + let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?; + return Ok(is_empty); + } Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..85d1bc626 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -271,6 +271,16 @@ pub fn snap_facet_id_exists_docids(index: &Index) -> String { &format!("{facet_id:<3} {}", display_bitmap(&docids)) }) } +pub fn snap_facet_id_is_null_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, facet_id_is_null_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) + }) +} +pub fn snap_facet_id_is_empty_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, facet_id_is_empty_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) + }) +} pub fn snap_facet_id_string_docids(index: &Index) -> String { make_db_snap_from_iter!(index, facet_id_string_docids, |( FacetGroupKey { field_id, level, left_bound }, @@ -495,6 +505,12 @@ macro_rules! 
full_snap_of_db { ($index:ident, facet_id_exists_docids) => {{ $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) }}; + ($index:ident, facet_id_is_null_docids) => {{ + $crate::snapshot_tests::snap_facet_id_is_null_docids(&$index) + }}; + ($index:ident, facet_id_is_empty_docids) => {{ + $crate::snapshot_tests::snap_facet_id_is_empty_docids(&$index) + }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 0296bc192..326e0825d 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -34,6 +34,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { facet_id_f64_docids, facet_id_string_docids, facet_id_exists_docids, + facet_id_is_null_docids, + facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -86,6 +88,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { script_language_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?; + facet_id_is_null_docids.clear(self.wtxn)?; + facet_id_is_empty_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index eeb67b829..6f2fa5e5a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -245,6 +245,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { field_id_docid_facet_strings: _, script_language_docids, facet_id_exists_docids, + facet_id_is_null_docids, + facet_id_is_empty_docids, documents, } = self.index; @@ -517,12 +519,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_id_exists_docids( + remove_docids_from_facet_id_docids( self.wtxn, facet_id_exists_docids, &self.to_delete_docids, )?; + // We delete the documents ids that are under the facet field id values. + remove_docids_from_facet_id_docids( + self.wtxn, + facet_id_is_null_docids, + &self.to_delete_docids, + )?; + + // We delete the documents ids that are under the facet field id values. 
+ remove_docids_from_facet_id_docids( + self.wtxn, + facet_id_is_empty_docids, + &self.to_delete_docids, + )?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; Ok(DetailedDocumentDeletionResult { @@ -625,7 +641,7 @@ fn remove_docids_from_field_id_docid_facet_value( Ok(all_affected_facet_values) } -fn remove_docids_from_facet_id_exists_docids<'a, C>( +fn remove_docids_from_facet_id_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 131b78df9..bcc4f26ec 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -181,7 +181,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { - Value::Null => false, + Value::Null | Value::Object(_) => false, Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), Value::Number(number) => write!(output, "{}", number).is_ok(), Value::String(string) => write!(output, "{}", string).is_ok(), @@ -196,23 +196,6 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st // check that at least one value was written count != 0 } - Value::Object(object) => { - let mut buffer = String::new(); - let mut count = 0; - for (key, value) in object { - buffer.clear(); - let _ = write!(&mut buffer, "{}: ", key); - if inner(value, &mut buffer) { - buffer.push_str(". "); - // We write the "key: value. " pair only when - // we are sure that the value can be written. - output.push_str(&buffer); - count += 1; - } - } - // check that at least one value was written - count != 0 - } } } diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index f0bd78792..77a5561fe 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -7,7 +7,7 @@ use std::mem::size_of; use heed::zerocopy::AsBytes; use heed::BytesEncode; use roaring::RoaringBitmap; -use serde_json::Value; +use serde_json::{from_slice, Value}; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; @@ -15,6 +15,15 @@ use crate::facet::value_encoding::f64_into_bytes; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; +/// The extracted facet values stored in grenad files by type. +pub struct ExtractedFacetValues { + pub docid_fid_facet_numbers_chunk: grenad::Reader, + pub docid_fid_facet_strings_chunk: grenad::Reader, + pub fid_facet_is_null_docids_chunk: grenad::Reader, + pub fid_facet_is_empty_docids_chunk: grenad::Reader, + pub fid_facet_exists_docids_chunk: grenad::Reader, +} + /// Extracts the facet values of each faceted field of each document. 
/// /// Returns the generated grenad reader containing the docid the fid and the orginal value as key @@ -24,7 +33,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { +) -> Result { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -46,6 +55,8 @@ pub fn extract_fid_docid_facet_values( ); let mut facet_exists_docids = BTreeMap::::new(); + let mut facet_is_null_docids = BTreeMap::::new(); + let mut facet_is_empty_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -69,33 +80,44 @@ pub fn extract_fid_docid_facet_values( // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(docid_bytes); - let value = - serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - let (numbers, strings) = extract_facet_values(&value); - - // insert facet numbers in sorter - for number in numbers { - key_buffer.truncate(size_of::() + size_of::()); - if let Some(value_bytes) = f64_into_bytes(number) { - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); - - fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + match extract_facet_values(&value) { + FilterableValues::Null => { + facet_is_null_docids.entry(field_id).or_default().insert(document); } - } + FilterableValues::Empty => { + facet_is_empty_docids.entry(field_id).or_default().insert(document); + } + FilterableValues::Values { numbers, strings } => { + // insert facet numbers in sorter + for number in numbers { + key_buffer.truncate(size_of::() + size_of::()); + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); - // insert normalized and original facet string in sorter - for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { - let normalised_truncated_value: String = normalized - .char_indices() - .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) - .map(|(_, c)| c) - .collect(); + fid_docid_facet_numbers_sorter + .insert(&key_buffer, ().as_bytes())?; + } + } - key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); - fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; + // insert normalized and original facet string in sorter + for (normalized, original) in + strings.into_iter().filter(|(n, _)| !n.is_empty()) + { + let normalized_truncated_value: String = normalized + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + + key_buffer.truncate(size_of::() + size_of::()); + key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); + fid_docid_facet_strings_sorter + .insert(&key_buffer, original.as_bytes())?; + } + } } } } @@ -112,14 +134,48 @@ pub fn extract_fid_docid_facet_values( } let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; - Ok(( - sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, - sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, - facet_exists_docids_reader, - )) + let mut facet_is_null_docids_writer = create_writer( + 
indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_is_null_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; + + let mut facet_is_empty_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_is_empty_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; + + Ok(ExtractedFacetValues { + docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, + docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, + fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, + fid_facet_exists_docids_chunk: facet_exists_docids_reader, + }) } -fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { +/// Represent what a document field contains. +enum FilterableValues { + /// Corresponds to the JSON `null` value. + Null, + /// Corresponds to either, an empty string `""`, an empty array `[]`, or an empty object `{}`. + Empty, + /// Represents all the numbers and strings values found in this document field. + Values { numbers: Vec, strings: Vec<(String, String)> }, +} + +fn extract_facet_values(value: &Value) -> FilterableValues { fn inner_extract_facet_values( value: &Value, can_recurse: bool, @@ -149,9 +205,16 @@ fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { } } - let mut facet_number_values = Vec::new(); - let mut facet_string_values = Vec::new(); - inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); - - (facet_number_values, facet_string_values) + match value { + Value::Null => FilterableValues::Null, + Value::String(s) if s.is_empty() => FilterableValues::Empty, + Value::Array(a) if a.is_empty() => FilterableValues::Empty, + Value::Object(o) if o.is_empty() => FilterableValues::Empty, + otherwise => { + let mut numbers = Vec::new(); + let mut strings = Vec::new(); + inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings); + FilterableValues::Values { numbers, strings } + } + } } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c0f07cf79..641a8a210 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -18,7 +18,7 @@ use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; -use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; +use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues}; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; @@ -55,28 +55,35 @@ pub(crate) fn data_from_obkv_documents( 
.collect::>()?; #[allow(clippy::type_complexity)] - let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks - .par_bridge() - .map(|flattened_obkv_chunks| { - send_and_extract_flattened_documents_data( - flattened_obkv_chunks, - indexer, - lmdb_writer_sx.clone(), - &searchable_fields, - &faceted_fields, - primary_key_id, - geo_fields_ids, - &stop_words, - max_positions_per_attributes, - ) - }) - .collect(); + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> = + flattened_obkv_chunks + .par_bridge() + .map(|flattened_obkv_chunks| { + send_and_extract_flattened_documents_data( + flattened_obkv_chunks, + indexer, + lmdb_writer_sx.clone(), + &searchable_fields, + &faceted_fields, + primary_key_id, + geo_fields_ids, + &stop_words, + max_positions_per_attributes, + ) + }) + .collect(); let ( docid_word_positions_chunks, ( docid_fid_facet_numbers_chunks, - (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), + ( + docid_fid_facet_strings_chunks, + ( + facet_is_null_docids_chunks, + (facet_is_empty_docids_chunks, facet_exists_docids_chunks), + ), + ), ), ) = result?; @@ -96,6 +103,38 @@ pub(crate) fn data_from_obkv_documents( }); } + // merge facet_is_null_docids and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + debug!("merge {} database", "facet-id-is-null-docids"); + match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); + } + } + }); + } + + // merge facet_is_empty_docids and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + debug!("merge {} database", "facet-id-is-empty-docids"); + match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); + } + } + }); + } + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer, @@ -235,7 +274,10 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + ( + grenad::Reader, + (grenad::Reader, (grenad::Reader, grenad::Reader)), + ), ), )> { let flattened_documents_chunk = @@ -281,11 +323,13 @@ fn send_and_extract_flattened_documents_data( Ok(docid_word_positions_chunk) }, || { - let ( + let ExtractedFacetValues { docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, + fid_facet_is_null_docids_chunk, + fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk, - ) = extract_fid_docid_facet_values( + } = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer, faceted_fields, @@ -309,7 +353,13 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), + ( + docid_fid_facet_strings_chunk, + ( + fid_facet_is_null_docids_chunk, + (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), + ), + ), )) }, ); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..ff02b9db2 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1757,6 +1757,187 @@ mod tests { check_ok(&index); } + #[test] + fn 
index_documents_check_is_null_database() { + let content = || { + documents!([ + { + "id": 0, + "colour": null, + }, + { + "id": 1, + "colour": [null], // must not be returned + }, + { + "id": 6, + "colour": { + "green": null + } + }, + { + "id": 7, + "colour": { + "green": { + "blue": null + } + } + }, + { + "id": 8, + "colour": 0, + }, + { + "id": 9, + "colour": [] + }, + { + "id": 10, + "colour": {} + }, + { + "id": 12, + "colour": [1] + }, + { + "id": 13 + }, + { + "id": 14, + "colour": { + "green": 1 + } + }, + { + "id": 15, + "colour": { + "green": { + "blue": [] + } + } + } + ]) + }; + + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + let colour_blue_id = + index.fields_ids_map(&rtxn).unwrap().id("colour.green.blue").unwrap(); + + let bitmap_null_colour = + index.facet_id_is_null_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); + assert_eq!(bitmap_null_colour.into_iter().collect::>(), vec![0]); + + let bitmap_colour_green = index + .facet_id_is_null_docids + .get(&rtxn, &BEU16::new(colour_green_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![2]); + + let bitmap_colour_blue = index + .facet_id_is_null_docids + .get(&rtxn, &BEU16::new(colour_blue_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![3]); + }; + + let faceted_fields = hashset!(S("colour")); + + let index = TempIndex::new(); + index.add_documents(content()).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + check_ok(&index); + + let index = TempIndex::new(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + index.add_documents(content()).unwrap(); + check_ok(&index); + } + + #[test] + fn index_documents_check_is_empty_database() { + let content = || { + documents!([ + {"id": 0, "tags": null }, + {"id": 1, "tags": [null] }, + {"id": 2, "tags": [] }, + {"id": 3, "tags": ["hello","world"] }, + {"id": 4, "tags": [""] }, + {"id": 5 }, + {"id": 6, "tags": {} }, + {"id": 7, "tags": {"green": "cool"} }, + {"id": 8, "tags": {"green": ""} }, + {"id": 9, "tags": "" }, + {"id": 10, "tags": { "green": null } }, + {"id": 11, "tags": { "green": { "blue": null } } }, + {"id": 12, "tags": { "green": { "blue": [] } } } + ]) + }; + + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue"))); + + let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap(); + let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap(); + let tags_blue_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green.blue").unwrap(); + + let bitmap_empty_tags = + index.facet_id_is_empty_docids.get(&rtxn, &BEU16::new(tags_id)).unwrap().unwrap(); + assert_eq!(bitmap_empty_tags.into_iter().collect::>(), vec![2, 6, 9]); + + let bitmap_tags_green = index + .facet_id_is_empty_docids + .get(&rtxn, &BEU16::new(tags_green_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_tags_green.into_iter().collect::>(), vec![8]); + + let 
bitmap_tags_blue = index + .facet_id_is_empty_docids + .get(&rtxn, &BEU16::new(tags_blue_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_tags_blue.into_iter().collect::>(), vec![12]); + }; + + let faceted_fields = hashset!(S("tags")); + + let index = TempIndex::new(); + index.add_documents(content()).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + check_ok(&index); + + let index = TempIndex::new(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + index.add_documents(content()).unwrap(); + check_ok(&index); + } + #[test] fn primary_key_must_not_contain_floats() { let index = TempIndex::new_with_map_size(4096 * 100); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..e1fc01ca9 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -39,6 +39,8 @@ pub(crate) enum TypedChunk { FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), FieldIdFacetExistsDocids(grenad::Reader), + FieldIdFacetIsNullDocids(grenad::Reader), + FieldIdFacetIsEmptyDocids(grenad::Reader), GeoPoints(grenad::Reader), ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), } @@ -161,6 +163,28 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => { + append_entries_into_database( + facet_id_is_null_docids, + &index.facet_id_is_null_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } + TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => { + append_entries_into_database( + facet_id_is_empty_docids, + &index.facet_id_is_empty_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( word_pair_proximity_docids_iter, diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 18de24ac3..db5a004e0 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -82,10 +82,23 @@ test_filter!( vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] ); test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); +test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]); test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); +test_filter!(null_filter_1, vec![Right("opt1 IS NULL")]); +test_filter!(null_filter_2, vec![Right("opt1.opt2 IS NULL")]); +test_filter!(null_filter_1_not, vec![Right("opt1 IS NOT NULL")]); +test_filter!(null_filter_1_not_alt, vec![Right("NOT opt1 IS NULL")]); +test_filter!(null_filter_1_double_not, vec![Right("NOT opt1 IS NOT NULL")]); + +test_filter!(empty_filter_1, vec![Right("opt1 IS EMPTY")]); +test_filter!(empty_filter_2, vec![Right("opt1.opt2 IS EMPTY")]); +test_filter!(empty_filter_1_not, vec![Right("opt1 IS NOT EMPTY")]); +test_filter!(empty_filter_1_not_alt, vec![Right("NOT opt1 IS EMPTY")]); +test_filter!(empty_filter_1_double_not, vec![Right("NOT opt1 IS NOT EMPTY")]); + 
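Beyond these black-box `test_filter!` cases, the storage side can also be checked directly through the two accessors added in `milli/src/index.rs` (`null_faceted_documents_ids` and `empty_faceted_documents_ids`). The following is only a sketch of how such a check could look, assuming milli's internal test utilities that the patch itself uses (`TempIndex`, the `documents!` and `hashset!` macros, the `S` string helper) and the usual insertion-order internal document ids; it is not code from the patch.

```rust
// Hedged sketch (not in the patch): exercising the new accessors end to end.
// Assumes milli's internal test utilities used elsewhere in this diff:
// `TempIndex`, the `documents!` and `hashset!` macros, and the `S` string helper.
#[test]
fn null_and_empty_accessors_sketch() {
    let index = TempIndex::new();
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(hashset!(S("colour")));
        })
        .unwrap();
    index
        .add_documents(documents!([
            { "id": 0, "colour": null },  // only this one is `IS NULL`
            { "id": 1, "colour": [] },    // only this one is `IS EMPTY`
            { "id": 2, "colour": "red" }, // matches neither
        ]))
        .unwrap();

    let rtxn = index.read_txn().unwrap();
    let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();

    // Both accessors fall back to an empty bitmap when the field id is absent.
    let null_docids = index.null_faceted_documents_ids(&rtxn, colour_id).unwrap();
    let empty_docids = index.empty_faceted_documents_ids(&rtxn, colour_id).unwrap();
    assert_eq!(null_docids.into_iter().collect::<Vec<_>>(), vec![0]);
    assert_eq!(empty_docids.into_iter().collect::<Vec<_>>(), vec![1]);
}
```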
test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 18c74e344..7f072ef95 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -205,6 +205,30 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { } else if let Some(opt1) = &document.opt1 { id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); } + } else if matches!(filter, "opt1 IS NULL" | "NOT opt1 IS NOT NULL") { + id = document.opt1.as_ref().map_or(false, |v| v.is_null()).then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 IS NULL" | "opt1 IS NOT NULL") { + id = document.opt1.as_ref().map_or(true, |v| !v.is_null()).then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 IS NULL") { + if document.opt1opt2.as_ref().map_or(false, |v| v.is_null()) { + id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + if !opt1.is_null() { + id = contains_null_rec(opt1, "opt2").then(|| document.id.clone()); + } + } + } else if matches!(filter, "opt1 IS EMPTY" | "NOT opt1 IS NOT EMPTY") { + id = document.opt1.as_ref().map_or(false, is_empty_value).then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 IS EMPTY" | "opt1 IS NOT EMPTY") { + id = document + .opt1 + .as_ref() + .map_or(true, |v| !is_empty_value(v)) + .then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 IS EMPTY") { + if document.opt1opt2.as_ref().map_or(false, is_empty_value) { + id = Some(document.id.clone()); + } } else if matches!( filter, "tag_in IN[1, 2, 3, four, five]" | "NOT tag_in NOT IN[1, 2, 3, four, five]" @@ -218,6 +242,15 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id } +pub fn is_empty_value(v: &serde_json::Value) -> bool { + match v { + serde_json::Value::String(s) => s.is_empty(), + serde_json::Value::Array(a) => a.is_empty(), + serde_json::Value::Object(o) => o.is_empty(), + _ => false, + } +} + pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { match v { serde_json::Value::Array(v) => { @@ -240,6 +273,28 @@ pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { } } +pub fn contains_null_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key && v.is_null() || contains_null_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_null_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect();
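The two helpers added above (`is_empty_value`, `contains_null_rec`) encode the matching rules the integration tests rely on: `IS EMPTY` covers only `""`, `[]` and `{}`, while a nested `opt1.opt2 IS NULL` matches as soon as an `opt2` key holding an explicit `null` appears anywhere under `opt1`. A quick sanity check, written here only as an illustrative sketch assuming it sits next to those helpers in `milli/tests/search/mod.rs`:

```rust
// Illustrative only: a sanity test for the helpers defined above, assuming it
// lives in the same module so the two functions are in scope.
#[test]
fn helpers_sanity_sketch() {
    use serde_json::json;

    // An `opt2` key holding an explicit null, nested under an array element:
    // this is the shape `opt1.opt2 IS NULL` is expected to match.
    let opt1 = json!([{ "opt2": null }, "unrelated"]);
    assert!(contains_null_rec(&opt1, "opt2"));
    // A non-null value under the same key does not count as null.
    assert!(!contains_null_rec(&json!({ "opt2": "set" }), "opt2"));

    // `IS EMPTY` covers exactly "", [] and {} ...
    assert!(is_empty_value(&json!("")));
    assert!(is_empty_value(&json!([])));
    assert!(is_empty_value(&json!({})));
    // ... but not null, [null] or 0.
    assert!(!is_empty_value(&json!(null)));
    assert!(!is_empty_value(&json!([null])));
    assert!(!is_empty_value(&json!(0)));
}
```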