From 9f1fb4b425bbbb78cd0c972add760710696a5d15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 17 Sep 2024 16:44:11 +0200 Subject: [PATCH] Introduce the STARTS WITH filter operator gated under an experimental feature --- filter-parser/src/condition.rs | 29 ++++++++++++++++++++++++++ filter-parser/src/error.rs | 2 +- filter-parser/src/lib.rs | 35 +++++++++++++++++++++----------- filter-parser/src/value.rs | 2 ++ index-scheduler/src/features.rs | 2 +- milli/src/search/facet/filter.rs | 20 +++++++++++++++++- 6 files changed, 75 insertions(+), 15 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 679555a89..04b6dc266 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -27,6 +27,7 @@ pub enum Condition<'a> { LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, Contains { keyword: Token<'a>, word: Token<'a> }, + StartsWith { keyword: Token<'a>, word: Token<'a> }, } /// condition = value ("==" | ">" ...) value @@ -121,6 +122,34 @@ pub fn parse_not_contains(input: Span) -> IResult { )) } +/// starts with = value "CONTAINS" value +pub fn parse_starts_with(input: Span) -> IResult { + let (input, (fid, starts_with, value)) = + tuple((parse_value, tag("STARTS WITH"), cut(parse_value)))(input)?; + Ok(( + input, + FilterCondition::Condition { + fid, + op: StartsWith { keyword: Token { span: starts_with, value: None }, word: value }, + }, + )) +} + +/// starts with = value "NOT" WS+ "CONTAINS" value +pub fn parse_not_starts_with(input: Span) -> IResult { + let keyword = tuple((tag("NOT"), multispace1, tag("STARTS WITH"))); + let (input, (fid, (_not, _spaces, starts_with), value)) = + tuple((parse_value, keyword, cut(parse_value)))(input)?; + + Ok(( + input, + FilterCondition::Not(Box::new(FilterCondition::Condition { + fid, + op: StartsWith { keyword: Token { span: starts_with, value: None }, word: value }, + })), + )) +} + /// to = value value "TO" WS+ value pub fn parse_to(input: Span) -> IResult { let (input, (key, from, _, _, to)) = diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index f530cc690..122396b87 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -146,7 +146,7 @@ impl<'a> Display for Error<'a> { } ErrorKind::InvalidPrimary => { let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) }; - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` {}", text)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` {}", text)? } ErrorKind::InvalidEscapedNumber => { writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index d06154f25..9c323660e 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -49,7 +49,7 @@ use std::fmt::Debug; pub use condition::{parse_condition, parse_to, Condition}; use condition::{ parse_contains, parse_exists, parse_is_empty, parse_is_not_empty, parse_is_not_null, - parse_is_null, parse_not_contains, parse_not_exists, + parse_is_null, parse_not_contains, parse_not_exists, parse_not_starts_with, parse_starts_with, }; use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; pub use error::{Error, ErrorKind}; @@ -166,7 +166,8 @@ impl<'a> FilterCondition<'a> { | Condition::LowerThan(_) | Condition::LowerThanOrEqual(_) | Condition::Between { .. } => None, - Condition::Contains { keyword, word: _ } => Some(keyword), + Condition::Contains { keyword, word: _ } + | Condition::StartsWith { keyword, word: _ } => Some(keyword), }, FilterCondition::Not(this) => this.use_contains_operator(), FilterCondition::Or(seq) | FilterCondition::And(seq) => { @@ -484,6 +485,8 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_to, parse_contains, parse_not_contains, + parse_starts_with, + parse_not_starts_with, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo, parse_geo_distance, @@ -567,6 +570,7 @@ impl<'a> std::fmt::Display for Condition<'a> { Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), Condition::Between { from, to } => write!(f, "{from} TO {to}"), Condition::Contains { word, keyword: _ } => write!(f, "CONTAINS {word}"), + Condition::StartsWith { word, keyword: _ } => write!(f, "STARTS WITH {word}"), } } } @@ -680,6 +684,13 @@ pub mod tests { insta::assert_snapshot!(p("NOT subscribers NOT CONTAINS 'hello'"), @"{subscribers} CONTAINS {hello}"); insta::assert_snapshot!(p("subscribers NOT CONTAINS 'hello'"), @"NOT ({subscribers} CONTAINS {hello})"); + // Test STARTS WITH + NOT STARTS WITH + insta::assert_snapshot!(p("subscribers STARTS WITH 'hel'"), @"{subscribers} STARTS WITH {hel}"); + insta::assert_snapshot!(p("NOT subscribers STARTS WITH 'hel'"), @"NOT ({subscribers} STARTS WITH {hel})"); + insta::assert_snapshot!(p("subscribers NOT STARTS WITH hel"), @"NOT ({subscribers} STARTS WITH {hel})"); + insta::assert_snapshot!(p("NOT subscribers NOT STARTS WITH 'hel'"), @"{subscribers} STARTS WITH {hel}"); + insta::assert_snapshot!(p("subscribers NOT STARTS WITH 'hel'"), @"NOT ({subscribers} STARTS WITH {hel})"); + // Test nested NOT insta::assert_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); insta::assert_snapshot!(p("NOT NOT (NOT NOT x = 5)"), @"{x} = {5}"); @@ -851,12 +862,12 @@ pub mod tests { "###); insta::assert_snapshot!(p("colour NOT EXIST"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `colour NOT EXIST`. 1:17 colour NOT EXIST "###); insta::assert_snapshot!(p("subscribers 100 TO1000"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `subscribers 100 TO1000`. 1:23 subscribers 100 TO1000 "###); @@ -919,35 +930,35 @@ pub mod tests { "###); insta::assert_snapshot!(p(r#"value NULL"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value NULL`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value NULL`. 1:11 value NULL "###); insta::assert_snapshot!(p(r#"value NOT NULL"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value NOT NULL`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value NOT NULL`. 1:15 value NOT NULL "###); insta::assert_snapshot!(p(r#"value EMPTY"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value EMPTY`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value EMPTY`. 1:12 value EMPTY "###); insta::assert_snapshot!(p(r#"value NOT EMPTY"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value NOT EMPTY`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value NOT EMPTY`. 1:16 value NOT EMPTY "###); insta::assert_snapshot!(p(r#"value IS"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value IS`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value IS`. 1:9 value IS "###); insta::assert_snapshot!(p(r#"value IS NOT"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT`. 1:13 value IS NOT "###); insta::assert_snapshot!(p(r#"value IS EXISTS"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value IS EXISTS`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value IS EXISTS`. 1:16 value IS EXISTS "###); insta::assert_snapshot!(p(r#"value IS NOT EXISTS"#), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT EXISTS`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `value IS NOT EXISTS`. 1:20 value IS NOT EXISTS "###); } diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 06ec1daef..5912f6900 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -212,6 +212,8 @@ fn is_keyword(s: &str) -> bool { | "NULL" | "EMPTY" | "CONTAINS" + | "STARTS" + | "WITH" | "_geoRadius" | "_geoBoundingBox" ) diff --git a/index-scheduler/src/features.rs b/index-scheduler/src/features.rs index c998ff444..f4ac80511 100644 --- a/index-scheduler/src/features.rs +++ b/index-scheduler/src/features.rs @@ -87,7 +87,7 @@ impl RoFeatures { Ok(()) } else { Err(FeatureNotEnabledError { - disabled_action: "Using `CONTAINS` in a filter", + disabled_action: "Using `CONTAINS` or `STARTS WITH` in a filter", feature: "contains filter", issue_link: "https://github.com/orgs/meilisearch/discussions/763", } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 9ce201aca..c059d2d27 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -12,7 +12,7 @@ use serde_json::Value; use super::facet_range_search; use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ - FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, OrderedF64Codec, }; use crate::index::db_name::FACET_ID_STRING_DOCIDS; use crate::{ @@ -336,6 +336,24 @@ impl<'a> Filter<'a> { return Ok(docids); } + Condition::StartsWith { keyword: _, word } => { + let value = crate::normalize_facet(word.value()); + let base = FacetGroupKey { field_id, level: 0, left_bound: value.as_str() }; + let docids = strings_db + .prefix_iter(rtxn, &base)? + .map(|result| -> Result { + match result { + Ok((_facet_group_key, FacetGroupValue { bitmap, .. })) => Ok(bitmap), + Err(_e) => Err(InternalError::from(SerializationError::Decoding { + db_name: Some(FACET_ID_STRING_DOCIDS), + }) + .into()), + } + }) + .union()?; + + return Ok(docids); + } }; let mut output = RoaringBitmap::new();