From 93285041a93725eec7fd97f7f31554c6c17459bb Mon Sep 17 00:00:00 2001 From: curquiza Date: Wed, 6 Sep 2023 09:23:20 +0000 Subject: [PATCH 1/7] Update version for the next release (v1.3.3) in Cargo.toml --- Cargo.lock | 28 ++++++++++++++-------------- Cargo.toml | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1ab4e53d0..c1392f4a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -469,7 +469,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "benchmarks" -version = "1.3.2" +version = "1.3.3" dependencies = [ "anyhow", "bytes", @@ -1199,7 +1199,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.3.2" +version = "1.3.3" dependencies = [ "anyhow", "big_s", @@ -1413,7 +1413,7 @@ dependencies = [ [[package]] name = "file-store" -version = "1.3.2" +version = "1.3.3" dependencies = [ "faux", "tempfile", @@ -1435,7 +1435,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.3.2" +version = "1.3.3" dependencies = [ "insta", "nom", @@ -1454,7 +1454,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.3.2" +version = "1.3.3" dependencies = [ "criterion", "serde_json", @@ -1572,7 +1572,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.3.2" +version = "1.3.3" dependencies = [ "arbitrary", "clap", @@ -1894,7 +1894,7 @@ dependencies = [ [[package]] name = "index-scheduler" -version = "1.3.2" +version = "1.3.3" dependencies = [ "anyhow", "big_s", @@ -2081,7 +2081,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.3.2" +version = "1.3.3" dependencies = [ "criterion", "serde_json", @@ -2493,7 +2493,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.3.2" +version = "1.3.3" dependencies = [ "insta", "md5", @@ -2502,7 +2502,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.3.2" +version = "1.3.3" dependencies = [ "actix-cors", "actix-http", @@ -2591,7 +2591,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.3.2" +version = "1.3.3" dependencies = [ "base64 0.21.2", "enum-iterator", @@ -2610,7 +2610,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.3.2" +version = "1.3.3" dependencies = [ "actix-web", "anyhow", @@ -2664,7 +2664,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.3.2" +version = "1.3.3" dependencies = [ "big_s", "bimap", @@ -2994,7 +2994,7 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "permissive-json-pointer" -version = "1.3.2" +version = "1.3.3" dependencies = [ "big_s", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 98b484ee4..afa79af5f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ members = [ ] [workspace.package] -version = "1.3.2" +version = "1.3.3" authors = ["Quentin de Quelen ", "Clément Renault "] description = "Meilisearch HTTP server" homepage = "https://meilisearch.com" From e02d0064bd620b02e9d00772189c9fd772aa8cfa Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Sep 2023 11:49:27 +0200 Subject: [PATCH 2/7] Add a test case scenario --- milli/src/update/index_documents/mod.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 849e84035..3704faf44 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2519,6 +2519,25 @@ mod tests { db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); } + /// Index multiple different number of vectors in documents. + /// Vectors must be of the same length. + #[test] + fn test_multiple_vectors() { + let index = TempIndex::new(); + + index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap(); + index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap(); + index + .add_documents( + documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]), + ) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap(); + assert_eq!(res.documents_ids.len(), 3); + } + #[test] fn reproduce_the_bug() { /* From 679c0b0f970c111b3c86309354bb82ce6be47e3a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Sep 2023 12:20:25 +0200 Subject: [PATCH 3/7] Extract the vectors from the non-flattened version of the documents --- .../src/update/index_documents/extract/mod.rs | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 6259c7272..6a3d6d972 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -55,7 +55,13 @@ pub(crate) fn data_from_obkv_documents( original_obkv_chunks .par_bridge() .map(|original_documents_chunk| { - send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone()) + send_original_documents_data( + original_documents_chunk, + indexer, + lmdb_writer_sx.clone(), + vectors_field_id, + primary_key_id, + ) }) .collect::>()?; @@ -72,7 +78,6 @@ pub(crate) fn data_from_obkv_documents( &faceted_fields, primary_key_id, geo_fields_ids, - vectors_field_id, &stop_words, max_positions_per_attributes, ) @@ -257,11 +262,33 @@ fn spawn_extraction_task( /// - documents fn send_original_documents_data( original_documents_chunk: Result>, + indexer: GrenadParameters, lmdb_writer_sx: Sender>, + vectors_field_id: Option, + primary_key_id: FieldId, ) -> Result<()> { let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; + if let Some(vectors_field_id) = vectors_field_id { + let documents_chunk_cloned = original_documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let result = extract_vector_points( + documents_chunk_cloned, + indexer, + primary_key_id, + vectors_field_id, + ); + let _ = match result { + Ok(vector_points) => { + lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) + } + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; + }); + } + // TODO: create a custom internal error lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); Ok(()) @@ -283,7 +310,6 @@ fn send_and_extract_flattened_documents_data( faceted_fields: &HashSet, primary_key_id: FieldId, geo_fields_ids: Option<(FieldId, FieldId)>, - vectors_field_id: Option, stop_words: &Option>, max_positions_per_attributes: Option, ) -> Result<( @@ -312,25 +338,6 @@ fn send_and_extract_flattened_documents_data( }); } - if let Some(vectors_field_id) = vectors_field_id { - let documents_chunk_cloned = flattened_documents_chunk.clone(); - let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); - rayon::spawn(move || { - let result = extract_vector_points( - documents_chunk_cloned, - indexer, - primary_key_id, - vectors_field_id, - ); - let _ = match result { - Ok(vector_points) => { - lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) - } - Err(error) => lmdb_writer_sx_cloned.send(Err(error)), - }; - }); - } - let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { From b42d48187a84e3e8acd7e337ccd5755967a2ecbe Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Sep 2023 11:39:06 +0200 Subject: [PATCH 4/7] Add a test case scenario --- filter-parser/src/lib.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 11ffbb148..5760c8865 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> { #[cfg(test)] pub mod tests { + use FilterCondition as Fc; + use super::*; /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element @@ -556,14 +558,22 @@ pub mod tests { unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into() } + fn p(s: &str) -> impl std::fmt::Display + '_ { + Fc::parse(s).unwrap().unwrap() + } + + #[test] + fn parse_escaped() { + insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#); + insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#); + insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#); + insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#); + // but it also works with other sequencies + insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}"); + } + #[test] fn parse() { - use FilterCondition as Fc; - - fn p(s: &str) -> impl std::fmt::Display + '_ { - Fc::parse(s).unwrap().unwrap() - } - // Test equal insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}"); insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}"); From ea7806091639da9d702550c9940e63dad2da4795 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Sep 2023 11:39:36 +0200 Subject: [PATCH 5/7] Fix tests that were supposed to escape characters --- filter-parser/src/value.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 735352fc3..932aabca9 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -318,17 +318,17 @@ pub mod test { ("\"cha'nnel\"", "cha'nnel", false), ("I'm tamo", "I", false), // escaped thing but not quote - (r#""\\""#, r#"\\"#, false), - (r#""\\\\\\""#, r#"\\\\\\"#, false), - (r#""aa\\aa""#, r#"aa\\aa"#, false), + (r#""\\""#, r#"\"#, true), + (r#""\\\\\\""#, r#"\\\"#, true), + (r#""aa\\aa""#, r#"aa\aa"#, true), // with double quote (r#""Hello \"world\"""#, r#"Hello "world""#, true), - (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true), + (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true), (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true), (r#""\"\"""#, r#""""#, true), // with simple quote (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true), - (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true), + (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true), (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true), (r#"'\'\''"#, r#"''"#, true), ]; @@ -350,7 +350,14 @@ pub mod test { "Filter `{}` was not supposed to be escaped", input ); - assert_eq!(token.value(), expected, "Filter `{}` failed.", input); + assert_eq!( + token.value(), + expected, + "Filter `{}` failed by giving `{}` instead of `{}`.", + input, + token.value(), + expected + ); } } From 03d0f628bdf6ba9cbe29d427c6001d64b253e2c0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Sep 2023 11:40:21 +0200 Subject: [PATCH 6/7] Use the unescaper crate to unescape any char sequence --- Cargo.lock | 10 ++++++++++ filter-parser/Cargo.toml | 1 + filter-parser/src/error.rs | 4 ++++ filter-parser/src/value.rs | 19 ++++++++++++++++++- 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 4aa7c7b67..26e0001b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1440,6 +1440,7 @@ dependencies = [ "insta", "nom", "nom_locate", + "unescaper", ] [[package]] @@ -4147,6 +4148,15 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e79c4d996edb816c91e4308506774452e55e95c3c9de07b6729e17e15a5ef81" +[[package]] +name = "unescaper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8" +dependencies = [ + "thiserror", +] + [[package]] name = "unicase" version = "2.6.0" diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 58111ee08..d9b47f638 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -14,6 +14,7 @@ license.workspace = true [dependencies] nom = "7.1.3" nom_locate = "4.1.0" +unescaper = "0.1.2" [dev-dependencies] insta = "1.29.0" diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index b9d14a271..c71a3aaee 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -62,6 +62,7 @@ pub enum ErrorKind<'a> { MisusedGeoRadius, MisusedGeoBoundingBox, InvalidPrimary, + InvalidEscapedNumber, ExpectedEof, ExpectedValue(ExpectedValueKind), MalformedValue, @@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> { let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) }; writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)? } + ErrorKind::InvalidEscapedNumber => { + writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)? + } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? } diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 932aabca9..63d5ac384 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult { }) })?; - Ok((input, value)) + match unescaper::unescape(value.value()) { + Ok(content) => { + if content.len() != value.value().len() { + Ok((input, Token::new(value.original_span(), Some(content)))) + } else { + Ok((input, value)) + } + } + Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)), + Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind( + value.original_span(), + ErrorKind::InvalidEscapedNumber, + ))), + Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind( + value.original_span(), + ErrorKind::MalformedValue, + ))), + } } fn is_value_component(c: char) -> bool { From 651657c03e71992e8dbafd17e727b7d63fc6b102 Mon Sep 17 00:00:00 2001 From: curquiza Date: Thu, 7 Sep 2023 16:48:13 +0200 Subject: [PATCH 7/7] Fix git conflicts --- Cargo.lock | 28 ++++++++++++++-------------- Cargo.toml | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e486648f3..03d491ff5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -469,7 +469,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" [[package]] name = "benchmarks" -version = "1.3.3" +version = "1.4.0" dependencies = [ "anyhow", "bytes", @@ -1199,7 +1199,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.3.3" +version = "1.4.0" dependencies = [ "anyhow", "big_s", @@ -1413,7 +1413,7 @@ dependencies = [ [[package]] name = "file-store" -version = "1.3.3" +version = "1.4.0" dependencies = [ "faux", "tempfile", @@ -1435,7 +1435,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.3.3" +version = "1.4.0" dependencies = [ "insta", "nom", @@ -1455,7 +1455,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.3.3" +version = "1.4.0" dependencies = [ "criterion", "serde_json", @@ -1573,7 +1573,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.3.3" +version = "1.4.0" dependencies = [ "arbitrary", "clap", @@ -1895,7 +1895,7 @@ dependencies = [ [[package]] name = "index-scheduler" -version = "1.3.3" +version = "1.4.0" dependencies = [ "anyhow", "big_s", @@ -2082,7 +2082,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.3.3" +version = "1.4.0" dependencies = [ "criterion", "serde_json", @@ -2494,7 +2494,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.3.3" +version = "1.4.0" dependencies = [ "insta", "md5", @@ -2503,7 +2503,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.3.3" +version = "1.4.0" dependencies = [ "actix-cors", "actix-http", @@ -2592,7 +2592,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.3.3" +version = "1.4.0" dependencies = [ "base64 0.21.2", "enum-iterator", @@ -2611,7 +2611,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.3.3" +version = "1.4.0" dependencies = [ "actix-web", "anyhow", @@ -2665,7 +2665,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.3.3" +version = "1.4.0" dependencies = [ "big_s", "bimap", @@ -2995,7 +2995,7 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] name = "permissive-json-pointer" -version = "1.3.3" +version = "1.4.0" dependencies = [ "big_s", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index afa79af5f..9c89aadfb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ members = [ ] [workspace.package] -version = "1.3.3" +version = "1.4.0" authors = ["Quentin de Quelen ", "Clément Renault "] description = "Meilisearch HTTP server" homepage = "https://meilisearch.com"